net/core/dev.c

   1 /*
   2  *      NET3    Protocol independent device support routines.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  *      Derived from the non IP parts of dev.c 1.0.19
  10  *              Authors:        Ross Biro
  11  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *
  14  *      Additional Authors:
  15  *              Florian la Roche <rzsfl@rz.uni-sb.de>
  16  *              Alan Cox <gw4pts@gw4pts.ampr.org>
  17  *              David Hinds <dahinds@users.sourceforge.net>
  18  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19  *              Adam Sulmicki <adam@cfar.umd.edu>
  20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21  *
  22  *      Changes:
  23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24  *                                      to 2 if register_netdev gets called
  25  *                                      before net_dev_init & also removed a
  26  *                                      few lines of code in the process.
  27  *              Alan Cox        :       device private ioctl copies fields back.
  28  *              Alan Cox        :       Transmit queue code does relevant
  29  *                                      stunts to keep the queue safe.
  30  *              Alan Cox        :       Fixed double lock.
  31  *              Alan Cox        :       Fixed promisc NULL pointer trap
  32  *              ????????        :       Support the full private ioctl range
  33  *              Alan Cox        :       Moved ioctl permission check into
  34  *                                      drivers
  35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36  *              Alan Cox        :       100 backlog just doesn't cut it when
  37  *                                      you start doing multicast video 8)
  38  *              Alan Cox        :       Rewrote net_bh and list manager.
  39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40  *              Alan Cox        :       Took out transmit every packet pass
  41  *                                      Saved a few bytes in the ioctl handler
  42  *              Alan Cox        :       Network driver sets packet type before
  43  *                                      calling netif_rx. Saves a function
  44  *                                      call a packet.
  45  *              Alan Cox        :       Hashed net_bh()
  46  *              Richard Kooijman:       Timestamp fixes.
  47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48  *              Alan Cox        :       Device lock protection.
  49  *              Alan Cox        :       Fixed nasty side effect of device close
  50  *                                      changes.
  51  *              Rudi Cilibrasi  :       Pass the right thing to
  52  *                                      set_mac_address()
  53  *              Dave Miller     :       32bit quantity for the device lock to
  54  *                                      make it work out on a Sparc.
  55  *              Bjorn Ekwall    :       Added KERNELD hack.
  56  *              Alan Cox        :       Cleaned up the backlog initialise.
  57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58  *                                      1 device.
  59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60  *                                      is no device open function.
  61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63  *              Cyrus Durgin    :       Cleaned for KMOD
  64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65  *                                      A network device unload needs to purge
  66  *                                      the backlog queue.
  67  *      Paul Rusty Russell      :       SIOCSIFNAME
  68  *              Pekka Riikonen  :       Netdev boot-time settings code
  69  *              Andrew Morton   :       Make unregister_netdevice wait
  70  *                                      indefinitely on dev->refcnt
  71  *              J Hadi Salim    :       - Backlog queue sampling
  72  *                                      - netif_rx() feedback
  73  */
  74
  75 #include <asm/uaccess.h>
  76 #include <linux/bitops.h>
  77 #include <linux/capability.h>
  78 #include <linux/cpu.h>
  79 #include <linux/types.h>
  80 #include <linux/kernel.h>
  81 #include <linux/hash.h>
  82 #include <linux/slab.h>
  83 #include <linux/sched.h>
  84 #include <linux/mutex.h>
  85 #include <linux/string.h>
  86 #include <linux/mm.h>
  87 #include <linux/socket.h>
  88 #include <linux/sockios.h>
  89 #include <linux/errno.h>
  90 #include <linux/interrupt.h>
  91 #include <linux/if_ether.h>
  92 #include <linux/netdevice.h>
  93 #include <linux/etherdevice.h>
  94 #include <linux/ethtool.h>
  95 #include <linux/notifier.h>
  96 #include <linux/skbuff.h>
  97 #include <net/net_namespace.h>
  98 #include <net/sock.h>
  99 #include <linux/rtnetlink.h>
 100 #include <linux/proc_fs.h>
 101 #include <linux/seq_file.h>
 102 #include <linux/stat.h>
 103 #include <net/dst.h>
 104 #include <net/pkt_sched.h>
 105 #include <net/checksum.h>
 106 #include <net/xfrm.h>
 107 #include <linux/highmem.h>
 108 #include <linux/init.h>
 109 #include <linux/kmod.h>
 110 #include <linux/module.h>
 111 #include <linux/netpoll.h>
 112 #include <linux/rcupdate.h>
 113 #include <linux/delay.h>
 114 #include <net/wext.h>
 115 #include <net/iw_handler.h>
 116 #include <asm/current.h>
 117 #include <linux/audit.h>
 118 #include <linux/dmaengine.h>
 119 #include <linux/err.h>
 120 #include <linux/ctype.h>
 121 #include <linux/if_arp.h>
 122 #include <linux/if_vlan.h>
 123 #include <linux/ip.h>
 124 #include <net/ip.h>
 125 #include <linux/ipv6.h>
 126 #include <linux/in.h>
 127 #include <linux/jhash.h>
 128 #include <linux/random.h>
 129 #include <trace/events/napi.h>
 130 #include <trace/events/net.h>
 131 #include <trace/events/skb.h>
 132 #include <linux/pci.h>
 133 #include <linux/inetdevice.h>
 134 #include <linux/cpu_rmap.h>
 135 #include <linux/net_tstamp.h>
 136 #include <linux/static_key.h>
 137 #include <net/flow_keys.h>
 138
 139 #include "net-sysfs.h"
 140
 141 /* Instead of increasing this, you should create a hash table. */
 142 #define MAX_GRO_SKBS 8
 143
 144 /* This should be increased if a protocol with a bigger head is added. */
 145 #define GRO_MAX_HEAD (MAX_HEADER + 128)
 146
 147 /*
 148  *      The list of packet types we will receive (as opposed to discard)
 149  *      and the routines to invoke.
 150  *
 151  *      Why 16. Because with 16 the only overlap we get on a hash of the
 152  *      low nibble of the protocol value is RARP/SNAP/X.25.
 153  *
 154  *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
 155  *             sure which should go first, but I bet it won't make much
 156  *             difference if we are running VLANs.  The good news is that
 157  *             this protocol won't be in the list unless compiled in, so
 158  *             the average user (w/out VLANs) will not be adversely affected.
 159  *             --BLG
 160  *
 161  *              0800    IP
 162  *              8100    802.1Q VLAN
 163  *              0001    802.3
 164  *              0002    AX.25
 165  *              0004    802.2
 166  *              8035    RARP
 167  *              0005    SNAP
 168  *              0805    X.25
 169  *              0806    ARP
 170  *              8137    IPX
 171  *              0009    Localtalk
 172  *              86DD    IPv6
 173  */
 174
 175 #define PTYPE_HASH_SIZE (16)
 176 #define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1)
 177
 178 static DEFINE_SPINLOCK(ptype_lock);
 179 static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 180 static struct list_head ptype_all __read_mostly;        /* Taps */
 181
 182 /*
 183  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 184  * semaphore.
 185  *
 186  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 187  *
 188  * Writers must hold the rtnl semaphore while they loop through the
 189  * dev_base_head list, and hold dev_base_lock for writing when they do the
 190  * actual updates.  This allows pure readers to access the list even
 191  * while a writer is preparing to update it.
 192  *
 193  * To put it another way, dev_base_lock is held for writing only to
 194  * protect against pure readers; the rtnl semaphore provides the
 195  * protection against other writers.
 196  *
 197  * See, for example usages, register_netdevice() and
 198  * unregister_netdevice(), which must be called with the rtnl
 199  * semaphore held.
 200  */
 201 DEFINE_RWLOCK(dev_base_lock);
 202 EXPORT_SYMBOL(dev_base_lock);
 203
 204 static inline void dev_base_seq_inc(struct net *net)
 205 {
 206         while (++net->dev_base_seq == 0);
 207 }
 208
 209 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 210 {
 211         unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 212         return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 213 }
 214
 215 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 216 {
 217         return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 218 }
 219
 220 static inline void rps_lock(struct softnet_data *sd)
 221 {
 222 #ifdef CONFIG_RPS
 223         spin_lock(&sd->input_pkt_queue.lock);
 224 #endif
 225 }
 226
 227 static inline void rps_unlock(struct softnet_data *sd)
 228 {
 229 #ifdef CONFIG_RPS
 230         spin_unlock(&sd->input_pkt_queue.lock);
 231 #endif
 232 }
 233
 234 /* Device list insertion */
 235 static int list_netdevice(struct net_device *dev)
 236 {
 237         struct net *net = dev_net(dev);
 238
 239         ASSERT_RTNL();
 240
 241         write_lock_bh(&dev_base_lock);
 242         list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 243         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 244         hlist_add_head_rcu(&dev->index_hlist,
 245                            dev_index_hash(net, dev->ifindex));
 246         write_unlock_bh(&dev_base_lock);
 247
 248         dev_base_seq_inc(net);
 249
 250         return 0;
 251 }
 252
 253 /* Device list removal
 254  * caller must respect a RCU grace period before freeing/reusing dev
 255  */
 256 static void unlist_netdevice(struct net_device *dev)
 257 {
 258         ASSERT_RTNL();
 259
 260         /* Unlink dev from the device chain */
 261         write_lock_bh(&dev_base_lock);
 262         list_del_rcu(&dev->dev_list);
 263         hlist_del_rcu(&dev->name_hlist);
 264         hlist_del_rcu(&dev->index_hlist);
 265         write_unlock_bh(&dev_base_lock);
 266
 267         dev_base_seq_inc(dev_net(dev));
 268 }
 269
 270 /*
 271  *      Our notifier list
 272  */
 273
 274 static RAW_NOTIFIER_HEAD(netdev_chain);
 275
 276 /*
 277  *      Device drivers call our routines to queue packets here. We empty the
 278  *      queue in the local softnet handler.
 279  */
 280
 281 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 282 EXPORT_PER_CPU_SYMBOL(softnet_data);
 283
 284 #ifdef CONFIG_LOCKDEP
 285 /*
 286  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 287  * according to dev->type
 288  */
 289 static const unsigned short netdev_lock_type[] =
 290         {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 291          ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 292          ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 293          ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 294          ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 295          ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 296          ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 297          ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 298          ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 299          ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 300          ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 301          ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 302          ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
 303          ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
 304          ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154,
 305          ARPHRD_VOID, ARPHRD_NONE};
 306
 307 static const char *const netdev_lock_name[] =
 308         {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 309          "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 310          "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 311          "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 312          "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 313          "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 314          "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 315          "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 316          "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 317          "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 318          "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 319          "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 320          "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
 321          "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
 322          "_xmit_PHONET_PIPE", "_xmit_IEEE802154",
 323          "_xmit_VOID", "_xmit_NONE"};
 324
 325 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 326 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 327
 328 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 329 {
 330         int i;
 331
 332         for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 333                 if (netdev_lock_type[i] == dev_type)
 334                         return i;
 335         /* the last key is used by default */
 336         return ARRAY_SIZE(netdev_lock_type) - 1;
 337 }
 338
 339 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 340                                                  unsigned short dev_type)
 341 {
 342         int i;
 343
 344         i = netdev_lock_pos(dev_type);
 345         lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 346                                    netdev_lock_name[i]);
 347 }
 348
 349 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 350 {
 351         int i;
 352
 353         i = netdev_lock_pos(dev->type);
 354         lockdep_set_class_and_name(&dev->addr_list_lock,
 355                                    &netdev_addr_lock_key[i],
 356                                    netdev_lock_name[i]);
 357 }
 358 #else
 359 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 360                                                  unsigned short dev_type)
 361 {
 362 }
 363 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 364 {
 365 }
 366 #endif
 367
 368 /*******************************************************************************
 369
 370                 Protocol management and registration routines
 371
 372 *******************************************************************************/
 373
 374 /*
 375  *      Add a protocol ID to the list. Now that the input handler is
 376  *      smarter we can dispense with all the messy stuff that used to be
 377  *      here.
 378  *
 379  *      BEWARE!!! Protocol handlers, mangling input packets,
 380  *      MUST BE last in hash buckets and checking protocol handlers
 381  *      MUST start from promiscuous ptype_all chain in net_bh.
 382  *      It is true now, do not change it.
 383  *      Explanation follows: if protocol handler, mangling packet, will
 384  *      be the first on list, it is not able to sense, that packet
 385  *      is cloned and should be copied-on-write, so that it will
 386  *      change it and subsequent readers will get broken packet.
 387  *                                                      --ANK (980803)
 388  */
 389
 390 static inline struct list_head *ptype_head(const struct packet_type *pt)
 391 {
 392         if (pt->type == htons(ETH_P_ALL))
 393                 return &ptype_all;
 394         else
 395                 return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 396 }
 397
 398 /**
 399  *      dev_add_pack - add packet handler
 400  *      @pt: packet type declaration
 401  *
 402  *      Add a protocol handler to the networking stack. The passed &packet_type
 403  *      is linked into kernel lists and may not be freed until it has been
 404  *      removed from the kernel lists.
 405  *
 406  *      This call does not sleep therefore it can not
 407  *      guarantee all CPU's that are in middle of receiving packets
 408  *      will see the new packet type (until the next received packet).
 409  */
 410
 411 void dev_add_pack(struct packet_type *pt)
 412 {
 413         struct list_head *head = ptype_head(pt);
 414
 415         spin_lock(&ptype_lock);
 416         list_add_rcu(&pt->list, head);
 417         spin_unlock(&ptype_lock);
 418 }
 419 EXPORT_SYMBOL(dev_add_pack);
 420
 421 /**
 422  *      __dev_remove_pack        - remove packet handler
 423  *      @pt: packet type declaration
 424  *
 425  *      Remove a protocol handler that was previously added to the kernel
 426  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 427  *      from the kernel lists and can be freed or reused once this function
 428  *      returns.
 429  *
 430  *      The packet type might still be in use by receivers
 431  *      and must not be freed until after all the CPU's have gone
 432  *      through a quiescent state.
 433  */
 434 void __dev_remove_pack(struct packet_type *pt)
 435 {
 436         struct list_head *head = ptype_head(pt);
 437         struct packet_type *pt1;
 438
 439         spin_lock(&ptype_lock);
 440
 441         list_for_each_entry(pt1, head, list) {
 442                 if (pt == pt1) {
 443                         list_del_rcu(&pt->list);
 444                         goto out;
 445                 }
 446         }
 447
 448         pr_warn("dev_remove_pack: %p not found\n", pt);
 449 out:
 450         spin_unlock(&ptype_lock);
 451 }
 452 EXPORT_SYMBOL(__dev_remove_pack);
 453
 454 /**
 455  *      dev_remove_pack  - remove packet handler
 456  *      @pt: packet type declaration
 457  *
 458  *      Remove a protocol handler that was previously added to the kernel
 459  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 460  *      from the kernel lists and can be freed or reused once this function
 461  *      returns.
 462  *
 463  *      This call sleeps to guarantee that no CPU is looking at the packet
 464  *      type after return.
 465  */
 466 void dev_remove_pack(struct packet_type *pt)
 467 {
 468         __dev_remove_pack(pt);
 469
 470         synchronize_net();
 471 }
 472 EXPORT_SYMBOL(dev_remove_pack);
 473
 474 /******************************************************************************
 475
 476                       Device Boot-time Settings Routines
 477
 478 *******************************************************************************/
 479
 480 /* Boot time configuration table */
 481 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 482
 483 /**
 484  *      netdev_boot_setup_add   - add new setup entry
 485  *      @name: name of the device
 486  *      @map: configured settings for the device
 487  *
 488  *      Adds new setup entry to the dev_boot_setup list.  The function
 489  *      returns 0 on error and 1 on success.  This is a generic routine to
 490  *      all netdevices.
 491  */
 492 static int netdev_boot_setup_add(char *name, struct ifmap *map)
 493 {
 494         struct netdev_boot_setup *s;
 495         int i;
 496
 497         s = dev_boot_setup;
 498         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 499                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 500                         memset(s[i].name, 0, sizeof(s[i].name));
 501                         strlcpy(s[i].name, name, IFNAMSIZ);
 502                         memcpy(&s[i].map, map, sizeof(s[i].map));
 503                         break;
 504                 }
 505         }
 506
 507         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 508 }
 509
 510 /**
 511  *      netdev_boot_setup_check - check boot time settings
 512  *      @dev: the netdevice
 513  *
 514  *      Check boot time settings for the device.
 515  *      The found settings are set for the device to be used
 516  *      later in the device probing.
 517  *      Returns 0 if no settings found, 1 if they are.
 518  */
 519 int netdev_boot_setup_check(struct net_device *dev)
 520 {
 521         struct netdev_boot_setup *s = dev_boot_setup;
 522         int i;
 523
 524         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 525                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 526                     !strcmp(dev->name, s[i].name)) {
 527                         dev->irq        = s[i].map.irq;
 528                         dev->base_addr  = s[i].map.base_addr;
 529                         dev->mem_start  = s[i].map.mem_start;
 530                         dev->mem_end    = s[i].map.mem_end;
 531                         return 1;
 532                 }
 533         }
 534         return 0;
 535 }
 536 EXPORT_SYMBOL(netdev_boot_setup_check);
 537
 538
 539 /**
 540  *      netdev_boot_base        - get address from boot time settings
 541  *      @prefix: prefix for network device
 542  *      @unit: id for network device
 543  *
 544  *      Check boot time settings for the base address of device.
 545  *      The found settings are set for the device to be used
 546  *      later in the device probing.
 547  *      Returns 0 if no settings found.
 548  */
 549 unsigned long netdev_boot_base(const char *prefix, int unit)
 550 {
 551         const struct netdev_boot_setup *s = dev_boot_setup;
 552         char name[IFNAMSIZ];
 553         int i;
 554
 555         sprintf(name, "%s%d", prefix, unit);
 556
 557         /*
 558          * If device already registered then return base of 1
 559          * to indicate not to probe for this interface
 560          */
 561         if (__dev_get_by_name(&init_net, name))
 562                 return 1;
 563
 564         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 565                 if (!strcmp(name, s[i].name))
 566                         return s[i].map.base_addr;
 567         return 0;
 568 }
 569
 570 /*
 571  * Saves at boot time configured settings for any netdevice.
 572  */
 573 int __init netdev_boot_setup(char *str)
 574 {
 575         int ints[5];
 576         struct ifmap map;
 577
 578         str = get_options(str, ARRAY_SIZE(ints), ints);
 579         if (!str || !*str)
 580                 return 0;
 581
 582         /* Save settings */
 583         memset(&map, 0, sizeof(map));
 584         if (ints[0] > 0)
 585                 map.irq = ints[1];
 586         if (ints[0] > 1)
 587                 map.base_addr = ints[2];
 588         if (ints[0] > 2)
 589                 map.mem_start = ints[3];
 590         if (ints[0] > 3)
 591                 map.mem_end = ints[4];
 592
 593         /* Add new entry to the list */
 594         return netdev_boot_setup_add(str, &map);
 595 }
 596
 597 __setup("netdev=", netdev_boot_setup);
 598
 599 /*******************************************************************************
 600
 601                             Device Interface Subroutines
 602
 603 *******************************************************************************/
 604
 605 /**
 606  *      __dev_get_by_name       - find a device by its name
 607  *      @net: the applicable net namespace
 608  *      @name: name to find
 609  *
 610  *      Find an interface by name. Must be called under RTNL semaphore
 611  *      or @dev_base_lock. If the name is found a pointer to the device
 612  *      is returned. If the name is not found then %NULL is returned. The
 613  *      reference counters are not incremented so the caller must be
 614  *      careful with locks.
 615  */
 616
 617 struct net_device *__dev_get_by_name(struct net *net, const char *name)
 618 {
 619         struct hlist_node *p;
 620         struct net_device *dev;
 621         struct hlist_head *head = dev_name_hash(net, name);
 622
 623         hlist_for_each_entry(dev, p, head, name_hlist)
 624                 if (!strncmp(dev->name, name, IFNAMSIZ))
 625                         return dev;
 626
 627         return NULL;
 628 }
 629 EXPORT_SYMBOL(__dev_get_by_name);
 630
 631 /**
 632  *      dev_get_by_name_rcu     - find a device by its name
 633  *      @net: the applicable net namespace
 634  *      @name: name to find
 635  *
 636  *      Find an interface by name.
 637  *      If the name is found a pointer to the device is returned.
 638  *      If the name is not found then %NULL is returned.
 639  *      The reference counters are not incremented so the caller must be
 640  *      careful with locks. The caller must hold RCU lock.
 641  */
 642
 643 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 644 {
 645         struct hlist_node *p;
 646         struct net_device *dev;
 647         struct hlist_head *head = dev_name_hash(net, name);
 648
 649         hlist_for_each_entry_rcu(dev, p, head, name_hlist)
 650                 if (!strncmp(dev->name, name, IFNAMSIZ))
 651                         return dev;
 652
 653         return NULL;
 654 }
 655 EXPORT_SYMBOL(dev_get_by_name_rcu);
 656
 657 /**
 658  *      dev_get_by_name         - find a device by its name
 659  *      @net: the applicable net namespace
 660  *      @name: name to find
 661  *
 662  *      Find an interface by name. This can be called from any
 663  *      context and does its own locking. The returned handle has
 664  *      the usage count incremented and the caller must use dev_put() to
 665  *      release it when it is no longer needed. %NULL is returned if no
 666  *      matching device is found.
 667  */
 668
 669 struct net_device *dev_get_by_name(struct net *net, const char *name)
 670 {
 671         struct net_device *dev;
 672
 673         rcu_read_lock();
 674         dev = dev_get_by_name_rcu(net, name);
 675         if (dev)
 676                 dev_hold(dev);
 677         rcu_read_unlock();
 678         return dev;
 679 }
 680 EXPORT_SYMBOL(dev_get_by_name);
 681
 682 /**
 683  *      __dev_get_by_index - find a device by its ifindex
 684  *      @net: the applicable net namespace
 685  *      @ifindex: index of device
 686  *
 687  *      Search for an interface by index. Returns %NULL if the device
 688  *      is not found or a pointer to the device. The device has not
 689  *      had its reference counter increased so the caller must be careful
 690  *      about locking. The caller must hold either the RTNL semaphore
 691  *      or @dev_base_lock.
 692  */
 693
 694 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 695 {
 696         struct hlist_node *p;
 697         struct net_device *dev;
 698         struct hlist_head *head = dev_index_hash(net, ifindex);
 699
 700         hlist_for_each_entry(dev, p, head, index_hlist)
 701                 if (dev->ifindex == ifindex)
 702                         return dev;
 703
 704         return NULL;
 705 }
 706 EXPORT_SYMBOL(__dev_get_by_index);
 707
 708 /**
 709  *      dev_get_by_index_rcu - find a device by its ifindex
 710  *      @net: the applicable net namespace
 711  *      @ifindex: index of device
 712  *
 713  *      Search for an interface by index. Returns %NULL if the device
 714  *      is not found or a pointer to the device. The device has not
 715  *      had its reference counter increased so the caller must be careful
 716  *      about locking. The caller must hold RCU lock.
 717  */
 718
 719 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 720 {
 721         struct hlist_node *p;
 722         struct net_device *dev;
 723         struct hlist_head *head = dev_index_hash(net, ifindex);
 724
 725         hlist_for_each_entry_rcu(dev, p, head, index_hlist)
 726                 if (dev->ifindex == ifindex)
 727                         return dev;
 728
 729         return NULL;
 730 }
 731 EXPORT_SYMBOL(dev_get_by_index_rcu);
 732
 733
 734 /**
 735  *      dev_get_by_index - find a device by its ifindex
 736  *      @net: the applicable net namespace
 737  *      @ifindex: index of device
 738  *
 739  *      Search for an interface by index. Returns NULL if the device
 740  *      is not found or a pointer to the device. The device returned has
 741  *      had a reference added and the pointer is safe until the user calls
 742  *      dev_put to indicate they have finished with it.
 743  */
 744
 745 struct net_device *dev_get_by_index(struct net *net, int ifindex)
 746 {
 747         struct net_device *dev;
 748
 749         rcu_read_lock();
 750         dev = dev_get_by_index_rcu(net, ifindex);
 751         if (dev)
 752                 dev_hold(dev);
 753         rcu_read_unlock();
 754         return dev;
 755 }
 756 EXPORT_SYMBOL(dev_get_by_index);
 757
 758 /**
 759  *      dev_getbyhwaddr_rcu - find a device by its hardware address
 760  *      @net: the applicable net namespace
 761  *      @type: media type of device
 762  *      @ha: hardware address
 763  *
 764  *      Search for an interface by MAC address. Returns NULL if the device
 765  *      is not found or a pointer to the device.
 766  *      The caller must hold RCU or RTNL.
 767  *      The returned device has not had its ref count increased
 768  *      and the caller must therefore be careful about locking
 769  *
 770  */
 771
 772 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 773                                        const char *ha)
 774 {
 775         struct net_device *dev;
 776
 777         for_each_netdev_rcu(net, dev)
 778                 if (dev->type == type &&
 779                     !memcmp(dev->dev_addr, ha, dev->addr_len))
 780                         return dev;
 781
 782         return NULL;
 783 }
 784 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 785
 786 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 787 {
 788         struct net_device *dev;
 789
 790         ASSERT_RTNL();
 791         for_each_netdev(net, dev)
 792                 if (dev->type == type)
 793                         return dev;
 794
 795         return NULL;
 796 }
 797 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 798
 799 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 800 {
 801         struct net_device *dev, *ret = NULL;
 802
 803         rcu_read_lock();
 804         for_each_netdev_rcu(net, dev)
 805                 if (dev->type == type) {
 806                         dev_hold(dev);
 807                         ret = dev;
 808                         break;
 809                 }
 810         rcu_read_unlock();
 811         return ret;
 812 }
 813 EXPORT_SYMBOL(dev_getfirstbyhwtype);
 814
 815 /**
 816  *      dev_get_by_flags_rcu - find any device with given flags
 817  *      @net: the applicable net namespace
 818  *      @if_flags: IFF_* values
 819  *      @mask: bitmask of bits in if_flags to check
 820  *
 821  *      Search for any interface with the given flags. Returns NULL if a device
 822  *      is not found or a pointer to the device. Must be called inside
 823  *      rcu_read_lock(), and result refcount is unchanged.
 824  */
 825
 826 struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
 827                                     unsigned short mask)
 828 {
 829         struct net_device *dev, *ret;
 830
 831         ret = NULL;
 832         for_each_netdev_rcu(net, dev) {
 833                 if (((dev->flags ^ if_flags) & mask) == 0) {
 834                         ret = dev;
 835                         break;
 836                 }
 837         }
 838         return ret;
 839 }
 840 EXPORT_SYMBOL(dev_get_by_flags_rcu);
 841
 842 /**
 843  *      dev_valid_name - check if name is okay for network device
 844  *      @name: name string
 845  *
 846  *      Network device names need to be valid file names to
 847  *      to allow sysfs to work.  We also disallow any kind of
 848  *      whitespace.
 849  */
 850 bool dev_valid_name(const char *name)
 851 {
 852         if (*name == '\0')
 853                 return false;
 854         if (strlen(name) >= IFNAMSIZ)
 855                 return false;
 856         if (!strcmp(name, ".") || !strcmp(name, ".."))
 857                 return false;
 858
 859         while (*name) {
 860                 if (*name == '/' || isspace(*name))
 861                         return false;
 862                 name++;
 863         }
 864         return true;
 865 }
 866 EXPORT_SYMBOL(dev_valid_name);
 867
 868 /**
 869  *      __dev_alloc_name - allocate a name for a device
 870  *      @net: network namespace to allocate the device name in
 871  *      @name: name format string
 872  *      @buf:  scratch buffer and result name string
 873  *
 874  *      Passed a format string - eg "lt%d" it will try and find a suitable
 875  *      id. It scans list of devices to build up a free map, then chooses
 876  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 877  *      while allocating the name and adding the device in order to avoid
 878  *      duplicates.
 879  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 880  *      Returns the number of the unit assigned or a negative errno code.
 881  */
 882
 883 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 884 {
 885         int i = 0;
 886         const char *p;
 887         const int max_netdevices = 8*PAGE_SIZE;
 888         unsigned long *inuse;
 889         struct net_device *d;
 890
 891         p = strnchr(name, IFNAMSIZ-1, '%');
 892         if (p) {
 893                 /*
 894                  * Verify the string as this thing may have come from
 895                  * the user.  There must be either one "%d" and no other "%"
 896                  * characters.
 897                  */
 898                 if (p[1] != 'd' || strchr(p + 2, '%'))
 899                         return -EINVAL;
 900
 901                 /* Use one page as a bit array of possible slots */
 902                 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 903                 if (!inuse)
 904                         return -ENOMEM;
 905
 906                 for_each_netdev(net, d) {
 907                         if (!sscanf(d->name, name, &i))
 908                                 continue;
 909                         if (i < 0 || i >= max_netdevices)
 910                                 continue;
 911
 912                         /*  avoid cases where sscanf is not exact inverse of printf */
 913                         snprintf(buf, IFNAMSIZ, name, i);
 914                         if (!strncmp(buf, d->name, IFNAMSIZ))
 915                                 set_bit(i, inuse);
 916                 }
 917
 918                 i = find_first_zero_bit(inuse, max_netdevices);
 919                 free_page((unsigned long) inuse);
 920         }
 921
 922         if (buf != name)
 923                 snprintf(buf, IFNAMSIZ, name, i);
 924         if (!__dev_get_by_name(net, buf))
 925                 return i;
 926
 927         /* It is possible to run out of possible slots
 928          * when the name is long and there isn't enough space left
 929          * for the digits, or if all bits are used.
 930          */
 931         return -ENFILE;
 932 }
 933
 934 /**
 935  *      dev_alloc_name - allocate a name for a device
 936  *      @dev: device
 937  *      @name: name format string
 938  *
 939  *      Passed a format string - eg "lt%d" it will try and find a suitable
 940  *      id. It scans list of devices to build up a free map, then chooses
 941  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 942  *      while allocating the name and adding the device in order to avoid
 943  *      duplicates.
 944  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 945  *      Returns the number of the unit assigned or a negative errno code.
 946  */
 947
 948 int dev_alloc_name(struct net_device *dev, const char *name)
 949 {
 950         char buf[IFNAMSIZ];
 951         struct net *net;
 952         int ret;
 953
 954         BUG_ON(!dev_net(dev));
 955         net = dev_net(dev);
 956         ret = __dev_alloc_name(net, name, buf);
 957         if (ret >= 0)
 958                 strlcpy(dev->name, buf, IFNAMSIZ);
 959         return ret;
 960 }
 961 EXPORT_SYMBOL(dev_alloc_name);
 962
 963 static int dev_get_valid_name(struct net_device *dev, const char *name)
 964 {
 965         struct net *net;
 966
 967         BUG_ON(!dev_net(dev));
 968         net = dev_net(dev);
 969
 970         if (!dev_valid_name(name))
 971                 return -EINVAL;
 972
 973         if (strchr(name, '%'))
 974                 return dev_alloc_name(dev, name);
 975         else if (__dev_get_by_name(net, name))
 976                 return -EEXIST;
 977         else if (dev->name != name)
 978                 strlcpy(dev->name, name, IFNAMSIZ);
 979
 980         return 0;
 981 }
 982
 983 /**
 984  *      dev_change_name - change name of a device
 985  *      @dev: device
 986  *      @newname: name (or format string) must be at least IFNAMSIZ
 987  *
 988  *      Change name of a device, can pass format strings "eth%d".
 989  *      for wildcarding.
 990  */
 991 int dev_change_name(struct net_device *dev, const char *newname)
 992 {
 993         char oldname[IFNAMSIZ];
 994         int err = 0;
 995         int ret;
 996         struct net *net;
 997
 998         ASSERT_RTNL();
 999         BUG_ON(!dev_net(dev));
1000
1001         net = dev_net(dev);
1002         if (dev->flags & IFF_UP)
1003                 return -EBUSY;
1004
1005         if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
1006                 return 0;
1007
1008         memcpy(oldname, dev->name, IFNAMSIZ);
1009
1010         err = dev_get_valid_name(dev, newname);
1011         if (err < 0)
1012                 return err;
1013
1014 rollback:
1015         ret = device_rename(&dev->dev, dev->name);
1016         if (ret) {
1017                 memcpy(dev->name, oldname, IFNAMSIZ);
1018                 return ret;
1019         }
1020
1021         write_lock_bh(&dev_base_lock);
1022         hlist_del_rcu(&dev->name_hlist);
1023         write_unlock_bh(&dev_base_lock);
1024
1025         synchronize_rcu();
1026
1027         write_lock_bh(&dev_base_lock);
1028         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1029         write_unlock_bh(&dev_base_lock);
1030
1031         ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1032         ret = notifier_to_errno(ret);
1033
1034         if (ret) {
1035                 /* err >= 0 after dev_alloc_name() or stores the first errno */
1036                 if (err >= 0) {
1037                         err = ret;
1038                         memcpy(dev->name, oldname, IFNAMSIZ);
1039                         goto rollback;
1040                 } else {
1041                         pr_err("%s: name change rollback failed: %d\n",
1042                                dev->name, ret);
1043                 }
1044         }
1045
1046         return err;
1047 }
1048
1049 /**
1050  *      dev_set_alias - change ifalias of a device
1051  *      @dev: device
1052  *      @alias: name up to IFALIASZ
1053  *      @len: limit of bytes to copy from info
1054  *
1055  *      Set ifalias for a device,
1056  */
1057 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1058 {
1059         ASSERT_RTNL();
1060
1061         if (len >= IFALIASZ)
1062                 return -EINVAL;
1063
1064         if (!len) {
1065                 if (dev->ifalias) {
1066                         kfree(dev->ifalias);
1067                         dev->ifalias = NULL;
1068                 }
1069                 return 0;
1070         }
1071
1072         dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1073         if (!dev->ifalias)
1074                 return -ENOMEM;
1075
1076         strlcpy(dev->ifalias, alias, len+1);
1077         return len;
1078 }
1079
1080
1081 /**
1082  *      netdev_features_change - device changes features
1083  *      @dev: device to cause notification
1084  *
1085  *      Called to indicate a device has changed features.
1086  */
1087 void netdev_features_change(struct net_device *dev)
1088 {
1089         call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1090 }
1091 EXPORT_SYMBOL(netdev_features_change);
1092
1093 /**
1094  *      netdev_state_change - device changes state
1095  *      @dev: device to cause notification
1096  *
1097  *      Called to indicate a device has changed state. This function calls
1098  *      the notifier chains for netdev_chain and sends a NEWLINK message
1099  *      to the routing socket.
1100  */
1101 void netdev_state_change(struct net_device *dev)
1102 {
1103         if (dev->flags & IFF_UP) {
1104                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
1105                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1106         }
1107 }
1108 EXPORT_SYMBOL(netdev_state_change);
1109
1110 int netdev_bonding_change(struct net_device *dev, unsigned long event)
1111 {
1112         return call_netdevice_notifiers(event, dev);
1113 }
1114 EXPORT_SYMBOL(netdev_bonding_change);
1115
1116 /**
1117  *      dev_load        - load a network module
1118  *      @net: the applicable net namespace
1119  *      @name: name of interface
1120  *
1121  *      If a network interface is not present and the process has suitable
1122  *      privileges this function loads the module. If module loading is not
1123  *      available in this kernel then it becomes a nop.
1124  */
1125
1126 void dev_load(struct net *net, const char *name)
1127 {
1128         struct net_device *dev;
1129         int no_module;
1130
1131         rcu_read_lock();
1132         dev = dev_get_by_name_rcu(net, name);
1133         rcu_read_unlock();
1134
1135         no_module = !dev;
1136         if (no_module && capable(CAP_NET_ADMIN))
1137                 no_module = request_module("netdev-%s", name);
1138         if (no_module && capable(CAP_SYS_MODULE)) {
1139                 if (!request_module("%s", name))
1140                         pr_err("Loading kernel module for a network device with CAP_SYS_MODULE (deprecated).  Use CAP_NET_ADMIN and alias netdev-%s instead.\n",
1141                                name);
1142         }
1143 }
1144 EXPORT_SYMBOL(dev_load);
1145
1146 static int __dev_open(struct net_device *dev)
1147 {
1148         const struct net_device_ops *ops = dev->netdev_ops;
1149         int ret;
1150
1151         ASSERT_RTNL();
1152
1153         if (!netif_device_present(dev))
1154                 return -ENODEV;
1155
1156         ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1157         ret = notifier_to_errno(ret);
1158         if (ret)
1159                 return ret;
1160
1161         set_bit(__LINK_STATE_START, &dev->state);
1162
1163         if (ops->ndo_validate_addr)
1164                 ret = ops->ndo_validate_addr(dev);
1165
1166         if (!ret && ops->ndo_open)
1167                 ret = ops->ndo_open(dev);
1168
1169         if (ret)
1170                 clear_bit(__LINK_STATE_START, &dev->state);
1171         else {
1172                 dev->flags |= IFF_UP;
1173                 net_dmaengine_get();
1174                 dev_set_rx_mode(dev);
1175                 dev_activate(dev);
1176         }
1177
1178         return ret;
1179 }
1180
1181 /**
1182  *      dev_open        - prepare an interface for use.
1183  *      @dev:   device to open
1184  *
1185  *      Takes a device from down to up state. The device's private open
1186  *      function is invoked and then the multicast lists are loaded. Finally
1187  *      the device is moved into the up state and a %NETDEV_UP message is
1188  *      sent to the netdev notifier chain.
1189  *
1190  *      Calling this function on an active interface is a nop. On a failure
1191  *      a negative errno code is returned.
1192  */
1193 int dev_open(struct net_device *dev)
1194 {
1195         int ret;
1196
1197         if (dev->flags & IFF_UP)
1198                 return 0;
1199
1200         ret = __dev_open(dev);
1201         if (ret < 0)
1202                 return ret;
1203
1204         rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1205         call_netdevice_notifiers(NETDEV_UP, dev);
1206
1207         return ret;
1208 }
1209 EXPORT_SYMBOL(dev_open);
1210
1211 static int __dev_close_many(struct list_head *head)
1212 {
1213         struct net_device *dev;
1214
1215         ASSERT_RTNL();
1216         might_sleep();
1217
1218         list_for_each_entry(dev, head, unreg_list) {
1219                 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1220
1221                 clear_bit(__LINK_STATE_START, &dev->state);
1222
1223                 /* Synchronize to scheduled poll. We cannot touch poll list, it
1224                  * can be even on different cpu. So just clear netif_running().
1225                  *
1226                  * dev->stop() will invoke napi_disable() on all of it's
1227                  * napi_struct instances on this device.
1228                  */
1229                 smp_mb__after_clear_bit(); /* Commit netif_running(). */
1230         }
1231
1232         dev_deactivate_many(head);
1233
1234         list_for_each_entry(dev, head, unreg_list) {
1235                 const struct net_device_ops *ops = dev->netdev_ops;
1236
1237                 /*
1238                  *      Call the device specific close. This cannot fail.
1239                  *      Only if device is UP
1240                  *
1241                  *      We allow it to be called even after a DETACH hot-plug
1242                  *      event.
1243                  */
1244                 if (ops->ndo_stop)
1245                         ops->ndo_stop(dev);
1246
1247                 dev->flags &= ~IFF_UP;
1248                 net_dmaengine_put();
1249         }
1250
1251         return 0;
1252 }
1253
1254 static int __dev_close(struct net_device *dev)
1255 {
1256         int retval;
1257         LIST_HEAD(single);
1258
1259         list_add(&dev->unreg_list, &single);
1260         retval = __dev_close_many(&single);
1261         list_del(&single);
1262         return retval;
1263 }
1264
1265 static int dev_close_many(struct list_head *head)
1266 {
1267         struct net_device *dev, *tmp;
1268         LIST_HEAD(tmp_list);
1269
1270         list_for_each_entry_safe(dev, tmp, head, unreg_list)
1271                 if (!(dev->flags & IFF_UP))
1272                         list_move(&dev->unreg_list, &tmp_list);
1273
1274         __dev_close_many(head);
1275
1276         list_for_each_entry(dev, head, unreg_list) {
1277                 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1278                 call_netdevice_notifiers(NETDEV_DOWN, dev);
1279         }
1280
1281         /* rollback_registered_many needs the complete original list */
1282         list_splice(&tmp_list, head);
1283         return 0;
1284 }
1285
1286 /**
1287  *      dev_close - shutdown an interface.
1288  *      @dev: device to shutdown
1289  *
1290  *      This function moves an active device into down state. A
1291  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1292  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1293  *      chain.
1294  */
1295 int dev_close(struct net_device *dev)
1296 {
1297         if (dev->flags & IFF_UP) {
1298                 LIST_HEAD(single);
1299
1300                 list_add(&dev->unreg_list, &single);
1301                 dev_close_many(&single);
1302                 list_del(&single);
1303         }
1304         return 0;
1305 }
1306 EXPORT_SYMBOL(dev_close);
1307
1308
1309 /**
1310  *      dev_disable_lro - disable Large Receive Offload on a device
1311  *      @dev: device
1312  *
1313  *      Disable Large Receive Offload (LRO) on a net device.  Must be
1314  *      called under RTNL.  This is needed if received packets may be
1315  *      forwarded to another interface.
1316  */
1317 void dev_disable_lro(struct net_device *dev)
1318 {
1319         /*
1320          * If we're trying to disable lro on a vlan device
1321          * use the underlying physical device instead
1322          */
1323         if (is_vlan_dev(dev))
1324                 dev = vlan_dev_real_dev(dev);
1325
1326         dev->wanted_features &= ~NETIF_F_LRO;
1327         netdev_update_features(dev);
1328
1329         if (unlikely(dev->features & NETIF_F_LRO))
1330                 netdev_WARN(dev, "failed to disable LRO!\n");
1331 }
1332 EXPORT_SYMBOL(dev_disable_lro);
1333
1334
1335 static int dev_boot_phase = 1;
1336
1337 /**
1338  *      register_netdevice_notifier - register a network notifier block
1339  *      @nb: notifier
1340  *
1341  *      Register a notifier to be called when network device events occur.
1342  *      The notifier passed is linked into the kernel structures and must
1343  *      not be reused until it has been unregistered. A negative errno code
1344  *      is returned on a failure.
1345  *
1346  *      When registered all registration and up events are replayed
1347  *      to the new notifier to allow device to have a race free
1348  *      view of the network device list.
1349  */
1350
1351 int register_netdevice_notifier(struct notifier_block *nb)
1352 {
1353         struct net_device *dev;
1354         struct net_device *last;
1355         struct net *net;
1356         int err;
1357
1358         rtnl_lock();
1359         err = raw_notifier_chain_register(&netdev_chain, nb);
1360         if (err)
1361                 goto unlock;
1362         if (dev_boot_phase)
1363                 goto unlock;
1364         for_each_net(net) {
1365                 for_each_netdev(net, dev) {
1366                         err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1367                         err = notifier_to_errno(err);
1368                         if (err)
1369                                 goto rollback;
1370
1371                         if (!(dev->flags & IFF_UP))
1372                                 continue;
1373
1374                         nb->notifier_call(nb, NETDEV_UP, dev);
1375                 }
1376         }
1377
1378 unlock:
1379         rtnl_unlock();
1380         return err;
1381
1382 rollback:
1383         last = dev;
1384         for_each_net(net) {
1385                 for_each_netdev(net, dev) {
1386                         if (dev == last)
1387                                 goto outroll;
1388
1389                         if (dev->flags & IFF_UP) {
1390                                 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1391                                 nb->notifier_call(nb, NETDEV_DOWN, dev);
1392                         }
1393                         nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1394                         nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
1395                 }
1396         }
1397
1398 outroll:
1399         raw_notifier_chain_unregister(&netdev_chain, nb);
1400         goto unlock;
1401 }
1402 EXPORT_SYMBOL(register_netdevice_notifier);
1403
1404 /**
1405  *      unregister_netdevice_notifier - unregister a network notifier block
1406  *      @nb: notifier
1407  *
1408  *      Unregister a notifier previously registered by
1409  *      register_netdevice_notifier(). The notifier is unlinked into the
1410  *      kernel structures and may then be reused. A negative errno code
1411  *      is returned on a failure.
1412  *
1413  *      After unregistering unregister and down device events are synthesized
1414  *      for all devices on the device list to the removed notifier to remove
1415  *      the need for special case cleanup code.
1416  */
1417
1418 int unregister_netdevice_notifier(struct notifier_block *nb)
1419 {
1420         struct net_device *dev;
1421         struct net *net;
1422         int err;
1423
1424         rtnl_lock();
1425         err = raw_notifier_chain_unregister(&netdev_chain, nb);
1426         if (err)
1427                 goto unlock;
1428
1429         for_each_net(net) {
1430                 for_each_netdev(net, dev) {
1431                         if (dev->flags & IFF_UP) {
1432                                 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1433                                 nb->notifier_call(nb, NETDEV_DOWN, dev);
1434                         }
1435                         nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1436                         nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
1437                 }
1438         }
1439 unlock:
1440         rtnl_unlock();
1441         return err;
1442 }
1443 EXPORT_SYMBOL(unregister_netdevice_notifier);
1444
1445 /**
1446  *      call_netdevice_notifiers - call all network notifier blocks
1447  *      @val: value passed unmodified to notifier function
1448  *      @dev: net_device pointer passed unmodified to notifier function
1449  *
1450  *      Call all network notifier blocks.  Parameters and return value
1451  *      are as for raw_notifier_call_chain().
1452  */
1453
1454 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1455 {
1456         ASSERT_RTNL();
1457         return raw_notifier_call_chain(&netdev_chain, val, dev);
1458 }
1459 EXPORT_SYMBOL(call_netdevice_notifiers);
1460
1461 static struct static_key netstamp_needed __read_mostly;
1462 #ifdef HAVE_JUMP_LABEL
1463 /* We are not allowed to call static_key_slow_dec() from irq context
1464  * If net_disable_timestamp() is called from irq context, defer the
1465  * static_key_slow_dec() calls.
1466  */
1467 static atomic_t netstamp_needed_deferred;
1468 #endif
1469
1470 void net_enable_timestamp(void)
1471 {
1472 #ifdef HAVE_JUMP_LABEL
1473         int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1474
1475         if (deferred) {
1476                 while (--deferred)
1477                         static_key_slow_dec(&netstamp_needed);
1478                 return;
1479         }
1480 #endif
1481         WARN_ON(in_interrupt());
1482         static_key_slow_inc(&netstamp_needed);
1483 }
1484 EXPORT_SYMBOL(net_enable_timestamp);
1485
1486 void net_disable_timestamp(void)
1487 {
1488 #ifdef HAVE_JUMP_LABEL
1489         if (in_interrupt()) {
1490                 atomic_inc(&netstamp_needed_deferred);
1491                 return;
1492         }
1493 #endif
1494         static_key_slow_dec(&netstamp_needed);
1495 }
1496 EXPORT_SYMBOL(net_disable_timestamp);
1497
1498 static inline void net_timestamp_set(struct sk_buff *skb)
1499 {
1500         skb->tstamp.tv64 = 0;
1501         if (static_key_false(&netstamp_needed))
1502                 __net_timestamp(skb);
1503 }
1504
1505 #define net_timestamp_check(COND, SKB)                  \
1506         if (static_key_false(&netstamp_needed)) {               \
1507                 if ((COND) && !(SKB)->tstamp.tv64)      \
1508                         __net_timestamp(SKB);           \
1509         }                                               \
1510
1511 static int net_hwtstamp_validate(struct ifreq *ifr)
1512 {
1513         struct hwtstamp_config cfg;
1514         enum hwtstamp_tx_types tx_type;
1515         enum hwtstamp_rx_filters rx_filter;
1516         int tx_type_valid = 0;
1517         int rx_filter_valid = 0;
1518
1519         if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg)))
1520                 return -EFAULT;
1521
1522         if (cfg.flags) /* reserved for future extensions */
1523                 return -EINVAL;
1524
1525         tx_type = cfg.tx_type;
1526         rx_filter = cfg.rx_filter;
1527
1528         switch (tx_type) {
1529         case HWTSTAMP_TX_OFF:
1530         case HWTSTAMP_TX_ON:
1531         case HWTSTAMP_TX_ONESTEP_SYNC:
1532                 tx_type_valid = 1;
1533                 break;
1534         }
1535
1536         switch (rx_filter) {
1537         case HWTSTAMP_FILTER_NONE:
1538         case HWTSTAMP_FILTER_ALL:
1539         case HWTSTAMP_FILTER_SOME:
1540         case HWTSTAMP_FILTER_PTP_V1_L4_EVENT:
1541         case HWTSTAMP_FILTER_PTP_V1_L4_SYNC:
1542         case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ:
1543         case HWTSTAMP_FILTER_PTP_V2_L4_EVENT:
1544         case HWTSTAMP_FILTER_PTP_V2_L4_SYNC:
1545         case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ:
1546         case HWTSTAMP_FILTER_PTP_V2_L2_EVENT:
1547         case HWTSTAMP_FILTER_PTP_V2_L2_SYNC:
1548         case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ:
1549         case HWTSTAMP_FILTER_PTP_V2_EVENT:
1550         case HWTSTAMP_FILTER_PTP_V2_SYNC:
1551         case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
1552                 rx_filter_valid = 1;
1553                 break;
1554         }
1555
1556         if (!tx_type_valid || !rx_filter_valid)
1557                 return -ERANGE;
1558
1559         return 0;
1560 }
1561
1562 static inline bool is_skb_forwardable(struct net_device *dev,
1563                                       struct sk_buff *skb)
1564 {
1565         unsigned int len;
1566
1567         if (!(dev->flags & IFF_UP))
1568                 return false;
1569
1570         len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1571         if (skb->len <= len)
1572                 return true;
1573
1574         /* if TSO is enabled, we don't care about the length as the packet
1575          * could be forwarded without being segmented before
1576          */
1577         if (skb_is_gso(skb))
1578                 return true;
1579
1580         return false;
1581 }
1582
1583 /**
1584  * dev_forward_skb - loopback an skb to another netif
1585  *
1586  * @dev: destination network device
1587  * @skb: buffer to forward
1588  *
1589  * return values:
1590  *      NET_RX_SUCCESS  (no congestion)
1591  *      NET_RX_DROP     (packet was dropped, but freed)
1592  *
1593  * dev_forward_skb can be used for injecting an skb from the
1594  * start_xmit function of one device into the receive queue
1595  * of another device.
1596  *
1597  * The receiving device may be in another namespace, so
1598  * we have to clear all information in the skb that could
1599  * impact namespace isolation.
1600  */
1601 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1602 {
1603         if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1604                 if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1605                         atomic_long_inc(&dev->rx_dropped);
1606                         kfree_skb(skb);
1607                         return NET_RX_DROP;
1608                 }
1609         }
1610
1611         skb_orphan(skb);
1612         nf_reset(skb);
1613
1614         if (unlikely(!is_skb_forwardable(dev, skb))) {
1615                 atomic_long_inc(&dev->rx_dropped);
1616                 kfree_skb(skb);
1617                 return NET_RX_DROP;
1618         }
1619         skb->skb_iif = 0;
1620         skb->dev = dev;
1621         skb_dst_drop(skb);
1622         skb->tstamp.tv64 = 0;
1623         skb->pkt_type = PACKET_HOST;
1624         skb->protocol = eth_type_trans(skb, dev);
1625         skb->mark = 0;
1626         secpath_reset(skb);
1627         nf_reset(skb);
1628         return netif_rx(skb);
1629 }
1630 EXPORT_SYMBOL_GPL(dev_forward_skb);
1631
1632 static inline int deliver_skb(struct sk_buff *skb,
1633                               struct packet_type *pt_prev,
1634                               struct net_device *orig_dev)
1635 {
1636         atomic_inc(&skb->users);
1637         return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1638 }
1639
1640 /*
1641  *      Support routine. Sends outgoing frames to any network
1642  *      taps currently in use.
1643  */
1644
1645 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1646 {
1647         struct packet_type *ptype;
1648         struct sk_buff *skb2 = NULL;
1649         struct packet_type *pt_prev = NULL;
1650
1651         rcu_read_lock();
1652         list_for_each_entry_rcu(ptype, &ptype_all, list) {
1653                 /* Never send packets back to the socket
1654                  * they originated from - MvS (miquels@drinkel.ow.org)
1655                  */
1656                 if ((ptype->dev == dev || !ptype->dev) &&
1657                     (ptype->af_packet_priv == NULL ||
1658                      (struct sock *)ptype->af_packet_priv != skb->sk)) {
1659                         if (pt_prev) {
1660                                 deliver_skb(skb2, pt_prev, skb->dev);
1661                                 pt_prev = ptype;
1662                                 continue;
1663                         }
1664
1665                         skb2 = skb_clone(skb, GFP_ATOMIC);
1666                         if (!skb2)
1667                                 break;
1668
1669                         net_timestamp_set(skb2);
1670
1671                         /* skb->nh should be correctly
1672                            set by sender, so that the second statement is
1673                            just protection against buggy protocols.
1674                          */
1675                         skb_reset_mac_header(skb2);
1676
1677                         if (skb_network_header(skb2) < skb2->data ||
1678                             skb2->network_header > skb2->tail) {
1679                                 if (net_ratelimit())
1680                                         pr_crit("protocol %04x is buggy, dev %s\n",
1681                                                 ntohs(skb2->protocol),
1682                                                 dev->name);
1683                                 skb_reset_network_header(skb2);
1684                         }
1685
1686                         skb2->transport_header = skb2->network_header;
1687                         skb2->pkt_type = PACKET_OUTGOING;
1688                         pt_prev = ptype;
1689                 }
1690         }
1691         if (pt_prev)
1692                 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1693         rcu_read_unlock();
1694 }
1695
1696 /* netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1697  * @dev: Network device
1698  * @txq: number of queues available
1699  *
1700  * If real_num_tx_queues is changed the tc mappings may no longer be
1701  * valid. To resolve this verify the tc mapping remains valid and if
1702  * not NULL the mapping. With no priorities mapping to this
1703  * offset/count pair it will no longer be used. In the worst case TC0
1704  * is invalid nothing can be done so disable priority mappings. If is
1705  * expected that drivers will fix this mapping if they can before
1706  * calling netif_set_real_num_tx_queues.
1707  */
1708 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1709 {
1710         int i;
1711         struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1712
1713         /* If TC0 is invalidated disable TC mapping */
1714         if (tc->offset + tc->count > txq) {
1715                 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1716                 dev->num_tc = 0;
1717                 return;
1718         }
1719
1720         /* Invalidated prio to tc mappings set to TC0 */
1721         for (i = 1; i < TC_BITMASK + 1; i++) {
1722                 int q = netdev_get_prio_tc_map(dev, i);
1723
1724                 tc = &dev->tc_to_txq[q];
1725                 if (tc->offset + tc->count > txq) {
1726                         pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1727                                 i, q);
1728                         netdev_set_prio_tc_map(dev, i, 0);
1729                 }
1730         }
1731 }
1732
1733 /*
1734  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
1735  * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
1736  */
1737 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
1738 {
1739         int rc;
1740
1741         if (txq < 1 || txq > dev->num_tx_queues)
1742                 return -EINVAL;
1743
1744         if (dev->reg_state == NETREG_REGISTERED ||
1745             dev->reg_state == NETREG_UNREGISTERING) {
1746                 ASSERT_RTNL();
1747
1748                 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
1749                                                   txq);
1750                 if (rc)
1751                         return rc;
1752
1753                 if (dev->num_tc)
1754                         netif_setup_tc(dev, txq);
1755
1756                 if (txq < dev->real_num_tx_queues)
1757                         qdisc_reset_all_tx_gt(dev, txq);
1758         }
1759
1760         dev->real_num_tx_queues = txq;
1761         return 0;
1762 }
1763 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
1764
1765 #ifdef CONFIG_RPS
1766 /**
1767  *      netif_set_real_num_rx_queues - set actual number of RX queues used
1768  *      @dev: Network device
1769  *      @rxq: Actual number of RX queues
1770  *
1771  *      This must be called either with the rtnl_lock held or before
1772  *      registration of the net device.  Returns 0 on success, or a
1773  *      negative error code.  If called before registration, it always
1774  *      succeeds.
1775  */
1776 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
1777 {
1778         int rc;
1779
1780         if (rxq < 1 || rxq > dev->num_rx_queues)
1781                 return -EINVAL;
1782
1783         if (dev->reg_state == NETREG_REGISTERED) {
1784                 ASSERT_RTNL();
1785
1786                 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
1787                                                   rxq);
1788                 if (rc)
1789                         return rc;
1790         }
1791
1792         dev->real_num_rx_queues = rxq;
1793         return 0;
1794 }
1795 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
1796 #endif
1797
1798 static inline void __netif_reschedule(struct Qdisc *q)
1799 {
1800         struct softnet_data *sd;
1801         unsigned long flags;
1802
1803         local_irq_save(flags);
1804         sd = &__get_cpu_var(softnet_data);
1805         q->next_sched = NULL;
1806         *sd->output_queue_tailp = q;
1807         sd->output_queue_tailp = &q->next_sched;
1808         raise_softirq_irqoff(NET_TX_SOFTIRQ);
1809         local_irq_restore(flags);
1810 }
1811
1812 void __netif_schedule(struct Qdisc *q)
1813 {
1814         if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1815                 __netif_reschedule(q);
1816 }
1817 EXPORT_SYMBOL(__netif_schedule);
1818
1819 void dev_kfree_skb_irq(struct sk_buff *skb)
1820 {
1821         if (atomic_dec_and_test(&skb->users)) {
1822                 struct softnet_data *sd;
1823                 unsigned long flags;
1824
1825                 local_irq_save(flags);
1826                 sd = &__get_cpu_var(softnet_data);
1827                 skb->next = sd->completion_queue;
1828                 sd->completion_queue = skb;
1829                 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1830                 local_irq_restore(flags);
1831         }
1832 }
1833 EXPORT_SYMBOL(dev_kfree_skb_irq);
1834
1835 void dev_kfree_skb_any(struct sk_buff *skb)
1836 {
1837         if (in_irq() || irqs_disabled())
1838                 dev_kfree_skb_irq(skb);
1839         else
1840                 dev_kfree_skb(skb);
1841 }
1842 EXPORT_SYMBOL(dev_kfree_skb_any);
1843
1844
1845 /**
1846  * netif_device_detach - mark device as removed
1847  * @dev: network device
1848  *
1849  * Mark device as removed from system and therefore no longer available.
1850  */
1851 void netif_device_detach(struct net_device *dev)
1852 {
1853         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1854             netif_running(dev)) {
1855                 netif_tx_stop_all_queues(dev);
1856         }
1857 }
1858 EXPORT_SYMBOL(netif_device_detach);
1859
1860 /**
1861  * netif_device_attach - mark device as attached
1862  * @dev: network device
1863  *
1864  * Mark device as attached from system and restart if needed.
1865  */
1866 void netif_device_attach(struct net_device *dev)
1867 {
1868         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1869             netif_running(dev)) {
1870                 netif_tx_wake_all_queues(dev);
1871                 __netdev_watchdog_up(dev);
1872         }
1873 }
1874 EXPORT_SYMBOL(netif_device_attach);
1875
1876 static void skb_warn_bad_offload(const struct sk_buff *skb)
1877 {
1878         static const netdev_features_t null_features = 0;
1879         struct net_device *dev = skb->dev;
1880         const char *driver = "";
1881
1882         if (dev && dev->dev.parent)
1883                 driver = dev_driver_string(dev->dev.parent);
1884
1885         WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
1886              "gso_type=%d ip_summed=%d\n",
1887              driver, dev ? &dev->features : &null_features,
1888              skb->sk ? &skb->sk->sk_route_caps : &null_features,
1889              skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
1890              skb_shinfo(skb)->gso_type, skb->ip_summed);
1891 }
1892
1893 /*
1894  * Invalidate hardware checksum when packet is to be mangled, and
1895  * complete checksum manually on outgoing path.
1896  */
1897 int skb_checksum_help(struct sk_buff *skb)
1898 {
1899         __wsum csum;
1900         int ret = 0, offset;
1901
1902         if (skb->ip_summed == CHECKSUM_COMPLETE)
1903                 goto out_set_summed;
1904
1905         if (unlikely(skb_shinfo(skb)->gso_size)) {
1906                 skb_warn_bad_offload(skb);
1907                 return -EINVAL;
1908         }
1909
1910         offset = skb_checksum_start_offset(skb);
1911         BUG_ON(offset >= skb_headlen(skb));
1912         csum = skb_checksum(skb, offset, skb->len - offset, 0);
1913
1914         offset += skb->csum_offset;
1915         BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1916
1917         if (skb_cloned(skb) &&
1918             !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1919                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1920                 if (ret)
1921                         goto out;
1922         }
1923
1924         *(__sum16 *)(skb->data + offset) = csum_fold(csum);
1925 out_set_summed:
1926         skb->ip_summed = CHECKSUM_NONE;
1927 out:
1928         return ret;
1929 }
1930 EXPORT_SYMBOL(skb_checksum_help);
1931
1932 /**
1933  *      skb_gso_segment - Perform segmentation on skb.
1934  *      @skb: buffer to segment
1935  *      @features: features for the output path (see dev->features)
1936  *
1937  *      This function segments the given skb and returns a list of segments.
1938  *
1939  *      It may return NULL if the skb requires no segmentation.  This is
1940  *      only possible when GSO is used for verifying header integrity.
1941  */
1942 struct sk_buff *skb_gso_segment(struct sk_buff *skb,
1943         netdev_features_t features)
1944 {
1945         struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1946         struct packet_type *ptype;
1947         __be16 type = skb->protocol;
1948         int vlan_depth = ETH_HLEN;
1949         int err;
1950
1951         while (type == htons(ETH_P_8021Q)) {
1952                 struct vlan_hdr *vh;
1953
1954                 if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
1955                         return ERR_PTR(-EINVAL);
1956
1957                 vh = (struct vlan_hdr *)(skb->data + vlan_depth);
1958                 type = vh->h_vlan_encapsulated_proto;
1959                 vlan_depth += VLAN_HLEN;
1960         }
1961
1962         skb_reset_mac_header(skb);
1963         skb->mac_len = skb->network_header - skb->mac_header;
1964         __skb_pull(skb, skb->mac_len);
1965
1966         if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1967                 skb_warn_bad_offload(skb);
1968
1969                 if (skb_header_cloned(skb) &&
1970                     (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1971                         return ERR_PTR(err);
1972         }
1973
1974         rcu_read_lock();
1975         list_for_each_entry_rcu(ptype,
1976                         &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
1977                 if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
1978                         if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1979                                 err = ptype->gso_send_check(skb);
1980                                 segs = ERR_PTR(err);
1981                                 if (err || skb_gso_ok(skb, features))
1982                                         break;
1983                                 __skb_push(skb, (skb->data -
1984                                                  skb_network_header(skb)));
1985                         }
1986                         segs = ptype->gso_segment(skb, features);
1987                         break;
1988                 }
1989         }
1990         rcu_read_unlock();
1991
1992         __skb_push(skb, skb->data - skb_mac_header(skb));
1993
1994         return segs;
1995 }
1996 EXPORT_SYMBOL(skb_gso_segment);
1997
1998 /* Take action when hardware reception checksum errors are detected. */
1999 #ifdef CONFIG_BUG
2000 void netdev_rx_csum_fault(struct net_device *dev)
2001 {
2002         if (net_ratelimit()) {
2003                 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2004                 dump_stack();
2005         }
2006 }
2007 EXPORT_SYMBOL(netdev_rx_csum_fault);
2008 #endif
2009
2010 /* Actually, we should eliminate this check as soon as we know, that:
2011  * 1. IOMMU is present and allows to map all the memory.
2012  * 2. No high memory really exists on this machine.
2013  */
2014
2015 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
2016 {
2017 #ifdef CONFIG_HIGHMEM
2018         int i;
2019         if (!(dev->features & NETIF_F_HIGHDMA)) {
2020                 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2021                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2022                         if (PageHighMem(skb_frag_page(frag)))
2023                                 return 1;
2024                 }
2025         }
2026
2027         if (PCI_DMA_BUS_IS_PHYS) {
2028                 struct device *pdev = dev->dev.parent;
2029
2030                 if (!pdev)
2031                         return 0;
2032                 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2033                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2034                         dma_addr_t addr = page_to_phys(skb_frag_page(frag));
2035                         if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2036                                 return 1;
2037                 }
2038         }
2039 #endif
2040         return 0;
2041 }
2042
2043 struct dev_gso_cb {
2044         void (*destructor)(struct sk_buff *skb);
2045 };
2046
2047 #define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
2048
2049 static void dev_gso_skb_destructor(struct sk_buff *skb)
2050 {
2051         struct dev_gso_cb *cb;
2052
2053         do {
2054                 struct sk_buff *nskb = skb->next;
2055
2056                 skb->next = nskb->next;
2057                 nskb->next = NULL;
2058                 kfree_skb(nskb);
2059         } while (skb->next);
2060
2061         cb = DEV_GSO_CB(skb);
2062         if (cb->destructor)
2063                 cb->destructor(skb);
2064 }
2065
2066 /**
2067  *      dev_gso_segment - Perform emulated hardware segmentation on skb.
2068  *      @skb: buffer to segment
2069  *      @features: device features as applicable to this skb
2070  *
2071  *      This function segments the given skb and stores the list of segments
2072  *      in skb->next.
2073  */
2074 static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)
2075 {
2076         struct sk_buff *segs;
2077
2078         segs = skb_gso_segment(skb, features);
2079
2080         /* Verifying header integrity only. */
2081         if (!segs)
2082                 return 0;
2083
2084         if (IS_ERR(segs))
2085                 return PTR_ERR(segs);
2086
2087         skb->next = segs;
2088         DEV_GSO_CB(skb)->destructor = skb->destructor;
2089         skb->destructor = dev_gso_skb_destructor;
2090
2091         return 0;
2092 }
2093
2094 static bool can_checksum_protocol(netdev_features_t features, __be16 protocol)
2095 {
2096         return ((features & NETIF_F_GEN_CSUM) ||
2097                 ((features & NETIF_F_V4_CSUM) &&
2098                  protocol == htons(ETH_P_IP)) ||
2099                 ((features & NETIF_F_V6_CSUM) &&
2100                  protocol == htons(ETH_P_IPV6)) ||
2101                 ((features & NETIF_F_FCOE_CRC) &&
2102                  protocol == htons(ETH_P_FCOE)));
2103 }
2104
2105 static netdev_features_t harmonize_features(struct sk_buff *skb,
2106         __be16 protocol, netdev_features_t features)
2107 {
2108         if (!can_checksum_protocol(features, protocol)) {
2109                 features &= ~NETIF_F_ALL_CSUM;
2110                 features &= ~NETIF_F_SG;
2111         } else if (illegal_highdma(skb->dev, skb)) {
2112                 features &= ~NETIF_F_SG;
2113         }
2114
2115         return features;
2116 }
2117
2118 netdev_features_t netif_skb_features(struct sk_buff *skb)
2119 {
2120         __be16 protocol = skb->protocol;
2121         netdev_features_t features = skb->dev->features;
2122
2123         if (protocol == htons(ETH_P_8021Q)) {
2124                 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2125                 protocol = veh->h_vlan_encapsulated_proto;
2126         } else if (!vlan_tx_tag_present(skb)) {
2127                 return harmonize_features(skb, protocol, features);
2128         }
2129
2130         features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX);
2131
2132         if (protocol != htons(ETH_P_8021Q)) {
2133                 return harmonize_features(skb, protocol, features);
2134         } else {
2135                 features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2136                                 NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX;
2137                 return harmonize_features(skb, protocol, features);
2138         }
2139 }
2140 EXPORT_SYMBOL(netif_skb_features);
2141
2142 /*
2143  * Returns true if either:
2144  *      1. skb has frag_list and the device doesn't support FRAGLIST, or
2145  *      2. skb is fragmented and the device does not support SG, or if
2146  *         at least one of fragments is in highmem and device does not
2147  *         support DMA from it.
2148  */
2149 static inline int skb_needs_linearize(struct sk_buff *skb,
2150                                       int features)
2151 {
2152         return skb_is_nonlinear(skb) &&
2153                         ((skb_has_frag_list(skb) &&
2154                                 !(features & NETIF_F_FRAGLIST)) ||
2155                         (skb_shinfo(skb)->nr_frags &&
2156                                 !(features & NETIF_F_SG)));
2157 }
2158
2159 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2160                         struct netdev_queue *txq)
2161 {
2162         const struct net_device_ops *ops = dev->netdev_ops;
2163         int rc = NETDEV_TX_OK;
2164         unsigned int skb_len;
2165
2166         if (likely(!skb->next)) {
2167                 netdev_features_t features;
2168
2169                 /*
2170                  * If device doesn't need skb->dst, release it right now while
2171                  * its hot in this cpu cache
2172                  */
2173                 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2174                         skb_dst_drop(skb);
2175
2176                 if (!list_empty(&ptype_all))
2177                         dev_queue_xmit_nit(skb, dev);
2178
2179                 features = netif_skb_features(skb);
2180
2181                 if (vlan_tx_tag_present(skb) &&
2182                     !(features & NETIF_F_HW_VLAN_TX)) {
2183                         skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
2184                         if (unlikely(!skb))
2185                                 goto out;
2186
2187                         skb->vlan_tci = 0;
2188                 }
2189
2190                 if (netif_needs_gso(skb, features)) {
2191                         if (unlikely(dev_gso_segment(skb, features)))
2192                                 goto out_kfree_skb;
2193                         if (skb->next)
2194                                 goto gso;
2195                 } else {
2196                         if (skb_needs_linearize(skb, features) &&
2197                             __skb_linearize(skb))
2198                                 goto out_kfree_skb;
2199
2200                         /* If packet is not checksummed and device does not
2201                          * support checksumming for this protocol, complete
2202                          * checksumming here.
2203                          */
2204                         if (skb->ip_summed == CHECKSUM_PARTIAL) {
2205                                 skb_set_transport_header(skb,
2206                                         skb_checksum_start_offset(skb));
2207                                 if (!(features & NETIF_F_ALL_CSUM) &&
2208                                      skb_checksum_help(skb))
2209                                         goto out_kfree_skb;
2210                         }
2211                 }
2212
2213                 skb_len = skb->len;
2214                 rc = ops->ndo_start_xmit(skb, dev);
2215                 trace_net_dev_xmit(skb, rc, dev, skb_len);
2216                 if (rc == NETDEV_TX_OK)
2217                         txq_trans_update(txq);
2218                 return rc;
2219         }
2220
2221 gso:
2222         do {
2223                 struct sk_buff *nskb = skb->next;
2224
2225                 skb->next = nskb->next;
2226                 nskb->next = NULL;
2227
2228                 /*
2229                  * If device doesn't need nskb->dst, release it right now while
2230                  * its hot in this cpu cache
2231                  */
2232                 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2233                         skb_dst_drop(nskb);
2234
2235                 skb_len = nskb->len;
2236                 rc = ops->ndo_start_xmit(nskb, dev);
2237                 trace_net_dev_xmit(nskb, rc, dev, skb_len);
2238                 if (unlikely(rc != NETDEV_TX_OK)) {
2239                         if (rc & ~NETDEV_TX_MASK)
2240                                 goto out_kfree_gso_skb;
2241                         nskb->next = skb->next;
2242                         skb->next = nskb;
2243                         return rc;
2244                 }
2245                 txq_trans_update(txq);
2246                 if (unlikely(netif_xmit_stopped(txq) && skb->next))
2247                         return NETDEV_TX_BUSY;
2248         } while (skb->next);
2249
2250 out_kfree_gso_skb:
2251         if (likely(skb->next == NULL))
2252                 skb->destructor = DEV_GSO_CB(skb)->destructor;
2253 out_kfree_skb:
2254         kfree_skb(skb);
2255 out:
2256         return rc;
2257 }
2258
2259 static u32 hashrnd __read_mostly;
2260
2261 /*
2262  * Returns a Tx hash based on the given packet descriptor a Tx queues' number
2263  * to be used as a distribution range.
2264  */
2265 u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
2266                   unsigned int num_tx_queues)
2267 {
2268         u32 hash;
2269         u16 qoffset = 0;
2270         u16 qcount = num_tx_queues;
2271
2272         if (skb_rx_queue_recorded(skb)) {
2273                 hash = skb_get_rx_queue(skb);
2274                 while (unlikely(hash >= num_tx_queues))
2275                         hash -= num_tx_queues;
2276                 return hash;
2277         }
2278
2279         if (dev->num_tc) {
2280                 u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2281                 qoffset = dev->tc_to_txq[tc].offset;
2282                 qcount = dev->tc_to_txq[tc].count;
2283         }
2284
2285         if (skb->sk && skb->sk->sk_hash)
2286                 hash = skb->sk->sk_hash;
2287         else
2288                 hash = (__force u16) skb->protocol;
2289         hash = jhash_1word(hash, hashrnd);
2290
2291         return (u16) (((u64) hash * qcount) >> 32) + qoffset;
2292 }
2293 EXPORT_SYMBOL(__skb_tx_hash);
2294
2295 static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
2296 {
2297         if (unlikely(queue_index >= dev->real_num_tx_queues)) {
2298                 if (net_ratelimit()) {
2299                         pr_warn("%s selects TX queue %d, but real number of TX queues is %d\n",
2300                                 dev->name, queue_index,
2301                                 dev->real_num_tx_queues);
2302                 }
2303                 return 0;
2304         }
2305         return queue_index;
2306 }
2307
2308 static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
2309 {
2310 #ifdef CONFIG_XPS
2311         struct xps_dev_maps *dev_maps;
2312         struct xps_map *map;
2313         int queue_index = -1;
2314
2315         rcu_read_lock();
2316         dev_maps = rcu_dereference(dev->xps_maps);
2317         if (dev_maps) {
2318                 map = rcu_dereference(
2319                     dev_maps->cpu_map[raw_smp_processor_id()]);
2320                 if (map) {
2321                         if (map->len == 1)
2322                                 queue_index = map->queues[0];
2323                         else {
2324                                 u32 hash;
2325                                 if (skb->sk && skb->sk->sk_hash)
2326                                         hash = skb->sk->sk_hash;
2327                                 else
2328                                         hash = (__force u16) skb->protocol ^
2329                                             skb->rxhash;
2330                                 hash = jhash_1word(hash, hashrnd);
2331                                 queue_index = map->queues[
2332                                     ((u64)hash * map->len) >> 32];
2333                         }
2334                         if (unlikely(queue_index >= dev->real_num_tx_queues))
2335                                 queue_index = -1;
2336                 }
2337         }
2338         rcu_read_unlock();
2339
2340         return queue_index;
2341 #else
2342         return -1;
2343 #endif
2344 }
2345
2346 static struct netdev_queue *dev_pick_tx(struct net_device *dev,
2347                                         struct sk_buff *skb)
2348 {
2349         int queue_index;
2350         const struct net_device_ops *ops = dev->netdev_ops;
2351
2352         if (dev->real_num_tx_queues == 1)
2353                 queue_index = 0;
2354         else if (ops->ndo_select_queue) {
2355                 queue_index = ops->ndo_select_queue(dev, skb);
2356                 queue_index = dev_cap_txqueue(dev, queue_index);
2357         } else {
2358                 struct sock *sk = skb->sk;
2359                 queue_index = sk_tx_queue_get(sk);
2360
2361                 if (queue_index < 0 || skb->ooo_okay ||
2362                     queue_index >= dev->real_num_tx_queues) {
2363                         int old_index = queue_index;
2364
2365                         queue_index = get_xps_queue(dev, skb);
2366                         if (queue_index < 0)
2367                                 queue_index = skb_tx_hash(dev, skb);
2368
2369                         if (queue_index != old_index && sk) {
2370                                 struct dst_entry *dst =
2371                                     rcu_dereference_check(sk->sk_dst_cache, 1);
2372
2373                                 if (dst && skb_dst(skb) == dst)
2374                                         sk_tx_queue_set(sk, queue_index);
2375                         }
2376                 }
2377         }
2378
2379         skb_set_queue_mapping(skb, queue_index);
2380         return netdev_get_tx_queue(dev, queue_index);
2381 }
2382
2383 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2384                                  struct net_device *dev,
2385                                  struct netdev_queue *txq)
2386 {
2387         spinlock_t *root_lock = qdisc_lock(q);
2388         bool contended;
2389         int rc;
2390
2391         qdisc_skb_cb(skb)->pkt_len = skb->len;
2392         qdisc_calculate_pkt_len(skb, q);
2393         /*
2394          * Heuristic to force contended enqueues to serialize on a
2395          * separate lock before trying to get qdisc main lock.
2396          * This permits __QDISC_STATE_RUNNING owner to get the lock more often
2397          * and dequeue packets faster.
2398          */
2399         contended = qdisc_is_running(q);
2400         if (unlikely(contended))
2401                 spin_lock(&q->busylock);
2402
2403         spin_lock(root_lock);
2404         if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2405                 kfree_skb(skb);
2406                 rc = NET_XMIT_DROP;
2407         } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2408                    qdisc_run_begin(q)) {
2409                 /*
2410                  * This is a work-conserving queue; there are no old skbs
2411                  * waiting to be sent out; and the qdisc is not running -
2412                  * xmit the skb directly.
2413                  */
2414                 if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2415                         skb_dst_force(skb);
2416
2417                 qdisc_bstats_update(q, skb);
2418
2419                 if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2420                         if (unlikely(contended)) {
2421                                 spin_unlock(&q->busylock);
2422                                 contended = false;
2423                         }
2424                         __qdisc_run(q);
2425                 } else
2426                         qdisc_run_end(q);
2427
2428                 rc = NET_XMIT_SUCCESS;
2429         } else {
2430                 skb_dst_force(skb);
2431                 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2432                 if (qdisc_run_begin(q)) {
2433                         if (unlikely(contended)) {
2434                                 spin_unlock(&q->busylock);
2435                                 contended = false;
2436                         }
2437                         __qdisc_run(q);
2438                 }
2439         }
2440         spin_unlock(root_lock);
2441         if (unlikely(contended))
2442                 spin_unlock(&q->busylock);
2443         return rc;
2444 }
2445
2446 #if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
2447 static void skb_update_prio(struct sk_buff *skb)
2448 {
2449         struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2450
2451         if ((!skb->priority) && (skb->sk) && map)
2452                 skb->priority = map->priomap[skb->sk->sk_cgrp_prioidx];
2453 }
2454 #else
2455 #define skb_update_prio(skb)
2456 #endif
2457
2458 static DEFINE_PER_CPU(int, xmit_recursion);
2459 #define RECURSION_LIMIT 10
2460
2461 /**
2462  *      dev_queue_xmit - transmit a buffer
2463  *      @skb: buffer to transmit
2464  *
2465  *      Queue a buffer for transmission to a network device. The caller must
2466  *      have set the device and priority and built the buffer before calling
2467  *      this function. The function can be called from an interrupt.
2468  *
2469  *      A negative errno code is returned on a failure. A success does not
2470  *      guarantee the frame will be transmitted as it may be dropped due
2471  *      to congestion or traffic shaping.
2472  *
2473  * -----------------------------------------------------------------------------------
2474  *      I notice this method can also return errors from the queue disciplines,
2475  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2476  *      be positive.
2477  *
2478  *      Regardless of the return value, the skb is consumed, so it is currently
2479  *      difficult to retry a send to this method.  (You can bump the ref count
2480  *      before sending to hold a reference for retry if you are careful.)
2481  *
2482  *      When calling this method, interrupts MUST be enabled.  This is because
2483  *      the BH enable code must have IRQs enabled so that it will not deadlock.
2484  *          --BLG
2485  */
2486 int dev_queue_xmit(struct sk_buff *skb)
2487 {
2488         struct net_device *dev = skb->dev;
2489         struct netdev_queue *txq;
2490         struct Qdisc *q;
2491         int rc = -ENOMEM;
2492
2493         /* Disable soft irqs for various locks below. Also
2494          * stops preemption for RCU.
2495          */
2496         rcu_read_lock_bh();
2497
2498         skb_update_prio(skb);
2499
2500         txq = dev_pick_tx(dev, skb);
2501         q = rcu_dereference_bh(txq->qdisc);
2502
2503 #ifdef CONFIG_NET_CLS_ACT
2504         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2505 #endif
2506         trace_net_dev_queue(skb);
2507         if (q->enqueue) {
2508                 rc = __dev_xmit_skb(skb, q, dev, txq);
2509                 goto out;
2510         }
2511
2512         /* The device has no queue. Common case for software devices:
2513            loopback, all the sorts of tunnels...
2514
2515            Really, it is unlikely that netif_tx_lock protection is necessary
2516            here.  (f.e. loopback and IP tunnels are clean ignoring statistics
2517            counters.)
2518            However, it is possible, that they rely on protection
2519            made by us here.
2520
2521            Check this and shot the lock. It is not prone from deadlocks.
2522            Either shot noqueue qdisc, it is even simpler 8)
2523          */
2524         if (dev->flags & IFF_UP) {
2525                 int cpu = smp_processor_id(); /* ok because BHs are off */
2526
2527                 if (txq->xmit_lock_owner != cpu) {
2528
2529                         if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2530                                 goto recursion_alert;
2531
2532                         HARD_TX_LOCK(dev, txq, cpu);
2533
2534                         if (!netif_xmit_stopped(txq)) {
2535                                 __this_cpu_inc(xmit_recursion);
2536                                 rc = dev_hard_start_xmit(skb, dev, txq);
2537                                 __this_cpu_dec(xmit_recursion);
2538                                 if (dev_xmit_complete(rc)) {
2539                                         HARD_TX_UNLOCK(dev, txq);
2540                                         goto out;
2541                                 }
2542                         }
2543                         HARD_TX_UNLOCK(dev, txq);
2544                         if (net_ratelimit())
2545                                 pr_crit("Virtual device %s asks to queue packet!\n",
2546                                         dev->name);
2547                 } else {
2548                         /* Recursion is detected! It is possible,
2549                          * unfortunately
2550                          */
2551 recursion_alert:
2552                         if (net_ratelimit())
2553                                 pr_crit("Dead loop on virtual device %s, fix it urgently!\n",
2554                                         dev->name);
2555                 }
2556         }
2557
2558         rc = -ENETDOWN;
2559         rcu_read_unlock_bh();
2560
2561         kfree_skb(skb);
2562         return rc;
2563 out:
2564         rcu_read_unlock_bh();
2565         return rc;
2566 }
2567 EXPORT_SYMBOL(dev_queue_xmit);
2568
2569
2570 /*=======================================================================
2571                         Receiver routines
2572   =======================================================================*/
2573
2574 int netdev_max_backlog __read_mostly = 1000;
2575 int netdev_tstamp_prequeue __read_mostly = 1;
2576 int netdev_budget __read_mostly = 300;
2577 int weight_p __read_mostly = 64;            /* old backlog weight */
2578
2579 /* Called with irq disabled */
2580 static inline void ____napi_schedule(struct softnet_data *sd,
2581                                      struct napi_struct *napi)
2582 {
2583         list_add_tail(&napi->poll_list, &sd->poll_list);
2584         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2585 }
2586
2587 /*
2588  * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
2589  * and src/dst port numbers.  Sets rxhash in skb to non-zero hash value
2590  * on success, zero indicates no valid hash.  Also, sets l4_rxhash in skb
2591  * if hash is a canonical 4-tuple hash over transport ports.
2592  */
2593 void __skb_get_rxhash(struct sk_buff *skb)
2594 {
2595         struct flow_keys keys;
2596         u32 hash;
2597
2598         if (!skb_flow_dissect(skb, &keys))
2599                 return;
2600
2601         if (keys.ports) {
2602                 if ((__force u16)keys.port16[1] < (__force u16)keys.port16[0])
2603                         swap(keys.port16[0], keys.port16[1]);
2604                 skb->l4_rxhash = 1;
2605         }
2606
2607         /* get a consistent hash (same value on both flow directions) */
2608         if ((__force u32)keys.dst < (__force u32)keys.src)
2609                 swap(keys.dst, keys.src);
2610
2611         hash = jhash_3words((__force u32)keys.dst,
2612                             (__force u32)keys.src,
2613                             (__force u32)keys.ports, hashrnd);
2614         if (!hash)
2615                 hash = 1;
2616
2617         skb->rxhash = hash;
2618 }
2619 EXPORT_SYMBOL(__skb_get_rxhash);
2620
2621 #ifdef CONFIG_RPS
2622
2623 /* One global table that all flow-based protocols share. */
2624 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2625 EXPORT_SYMBOL(rps_sock_flow_table);
2626
2627 struct static_key rps_needed __read_mostly;
2628
2629 static struct rps_dev_flow *
2630 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2631             struct rps_dev_flow *rflow, u16 next_cpu)
2632 {
2633         if (next_cpu != RPS_NO_CPU) {
2634 #ifdef CONFIG_RFS_ACCEL
2635                 struct netdev_rx_queue *rxqueue;
2636                 struct rps_dev_flow_table *flow_table;
2637                 struct rps_dev_flow *old_rflow;
2638                 u32 flow_id;
2639                 u16 rxq_index;
2640                 int rc;
2641
2642                 /* Should we steer this flow to a different hardware queue? */
2643                 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
2644                     !(dev->features & NETIF_F_NTUPLE))
2645                         goto out;
2646                 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
2647                 if (rxq_index == skb_get_rx_queue(skb))
2648                         goto out;
2649
2650                 rxqueue = dev->_rx + rxq_index;
2651                 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2652                 if (!flow_table)
2653                         goto out;
2654                 flow_id = skb->rxhash & flow_table->mask;
2655                 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2656                                                         rxq_index, flow_id);
2657                 if (rc < 0)
2658                         goto out;
2659                 old_rflow = rflow;
2660                 rflow = &flow_table->flows[flow_id];
2661                 rflow->filter = rc;
2662                 if (old_rflow->filter == rflow->filter)
2663                         old_rflow->filter = RPS_NO_FILTER;
2664         out:
2665 #endif
2666                 rflow->last_qtail =
2667                         per_cpu(softnet_data, next_cpu).input_queue_head;
2668         }
2669
2670         rflow->cpu = next_cpu;
2671         return rflow;
2672 }
2673
2674 /*
2675  * get_rps_cpu is called from netif_receive_skb and returns the target
2676  * CPU from the RPS map of the receiving queue for a given skb.
2677  * rcu_read_lock must be held on entry.
2678  */
2679 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2680                        struct rps_dev_flow **rflowp)
2681 {
2682         struct netdev_rx_queue *rxqueue;
2683         struct rps_map *map;
2684         struct rps_dev_flow_table *flow_table;
2685         struct rps_sock_flow_table *sock_flow_table;
2686         int cpu = -1;
2687         u16 tcpu;
2688
2689         if (skb_rx_queue_recorded(skb)) {
2690                 u16 index = skb_get_rx_queue(skb);
2691                 if (unlikely(index >= dev->real_num_rx_queues)) {
2692                         WARN_ONCE(dev->real_num_rx_queues > 1,
2693                                   "%s received packet on queue %u, but number "
2694                                   "of RX queues is %u\n",
2695                                   dev->name, index, dev->real_num_rx_queues);
2696                         goto done;
2697                 }
2698                 rxqueue = dev->_rx + index;
2699         } else
2700                 rxqueue = dev->_rx;
2701
2702         map = rcu_dereference(rxqueue->rps_map);
2703         if (map) {
2704                 if (map->len == 1 &&
2705                     !rcu_access_pointer(rxqueue->rps_flow_table)) {
2706                         tcpu = map->cpus[0];
2707                         if (cpu_online(tcpu))
2708                                 cpu = tcpu;
2709                         goto done;
2710                 }
2711         } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
2712                 goto done;
2713         }
2714
2715         skb_reset_network_header(skb);
2716         if (!skb_get_rxhash(skb))
2717                 goto done;
2718
2719         flow_table = rcu_dereference(rxqueue->rps_flow_table);
2720         sock_flow_table = rcu_dereference(rps_sock_flow_table);
2721         if (flow_table && sock_flow_table) {
2722                 u16 next_cpu;
2723                 struct rps_dev_flow *rflow;
2724
2725                 rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2726                 tcpu = rflow->cpu;
2727
2728                 next_cpu = sock_flow_table->ents[skb->rxhash &
2729                     sock_flow_table->mask];
2730
2731                 /*
2732                  * If the desired CPU (where last recvmsg was done) is
2733                  * different from current CPU (one in the rx-queue flow
2734                  * table entry), switch if one of the following holds:
2735                  *   - Current CPU is unset (equal to RPS_NO_CPU).
2736                  *   - Current CPU is offline.
2737                  *   - The current CPU's queue tail has advanced beyond the
2738                  *     last packet that was enqueued using this table entry.
2739                  *     This guarantees that all previous packets for the flow
2740                  *     have been dequeued, thus preserving in order delivery.
2741                  */
2742                 if (unlikely(tcpu != next_cpu) &&
2743                     (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2744                      ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2745                       rflow->last_qtail)) >= 0))
2746                         rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
2747
2748                 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2749                         *rflowp = rflow;
2750                         cpu = tcpu;
2751                         goto done;
2752                 }
2753         }
2754
2755         if (map) {
2756                 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
2757
2758                 if (cpu_online(tcpu)) {
2759                         cpu = tcpu;
2760                         goto done;
2761                 }
2762         }
2763
2764 done:
2765         return cpu;
2766 }
2767
2768 #ifdef CONFIG_RFS_ACCEL
2769
2770 /**
2771  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
2772  * @dev: Device on which the filter was set
2773  * @rxq_index: RX queue index
2774  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
2775  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
2776  *
2777  * Drivers that implement ndo_rx_flow_steer() should periodically call
2778  * this function for each installed filter and remove the filters for
2779  * which it returns %true.
2780  */
2781 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
2782                          u32 flow_id, u16 filter_id)
2783 {
2784         struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
2785         struct rps_dev_flow_table *flow_table;
2786         struct rps_dev_flow *rflow;
2787         bool expire = true;
2788         int cpu;
2789
2790         rcu_read_lock();
2791         flow_table = rcu_dereference(rxqueue->rps_flow_table);
2792         if (flow_table && flow_id <= flow_table->mask) {
2793                 rflow = &flow_table->flows[flow_id];
2794                 cpu = ACCESS_ONCE(rflow->cpu);
2795                 if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
2796                     ((int)(per_cpu(softnet_data, cpu).input_queue_head -
2797                            rflow->last_qtail) <
2798                      (int)(10 * flow_table->mask)))
2799                         expire = false;
2800         }
2801         rcu_read_unlock();
2802         return expire;
2803 }
2804 EXPORT_SYMBOL(rps_may_expire_flow);
2805
2806 #endif /* CONFIG_RFS_ACCEL */
2807
2808 /* Called from hardirq (IPI) context */
2809 static void rps_trigger_softirq(void *data)
2810 {
2811         struct softnet_data *sd = data;
2812
2813         ____napi_schedule(sd, &sd->backlog);
2814         sd->received_rps++;
2815 }
2816
2817 #endif /* CONFIG_RPS */
2818
2819 /*
2820  * Check if this softnet_data structure is another cpu one
2821  * If yes, queue it to our IPI list and return 1
2822  * If no, return 0
2823  */
2824 static int rps_ipi_queued(struct softnet_data *sd)
2825 {
2826 #ifdef CONFIG_RPS
2827         struct softnet_data *mysd = &__get_cpu_var(softnet_data);
2828
2829         if (sd != mysd) {
2830                 sd->rps_ipi_next = mysd->rps_ipi_list;
2831                 mysd->rps_ipi_list = sd;
2832
2833                 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2834                 return 1;
2835         }
2836 #endif /* CONFIG_RPS */
2837         return 0;
2838 }
2839
2840 /*
2841  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
2842  * queue (may be a remote CPU queue).
2843  */
2844 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
2845                               unsigned int *qtail)
2846 {
2847         struct softnet_data *sd;
2848         unsigned long flags;
2849
2850         sd = &per_cpu(softnet_data, cpu);
2851
2852         local_irq_save(flags);
2853
2854         rps_lock(sd);
2855         if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
2856                 if (skb_queue_len(&sd->input_pkt_queue)) {
2857 enqueue:
2858                         __skb_queue_tail(&sd->input_pkt_queue, skb);
2859                         input_queue_tail_incr_save(sd, qtail);
2860                         rps_unlock(sd);
2861                         local_irq_restore(flags);
2862                         return NET_RX_SUCCESS;
2863                 }
2864
2865                 /* Schedule NAPI for backlog device
2866                  * We can use non atomic operation since we own the queue lock
2867                  */
2868                 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
2869                         if (!rps_ipi_queued(sd))
2870                                 ____napi_schedule(sd, &sd->backlog);
2871                 }
2872                 goto enqueue;
2873         }
2874
2875         sd->dropped++;
2876         rps_unlock(sd);
2877
2878         local_irq_restore(flags);
2879
2880         atomic_long_inc(&skb->dev->rx_dropped);
2881         kfree_skb(skb);
2882         return NET_RX_DROP;
2883 }
2884
2885 /**
2886  *      netif_rx        -       post buffer to the network code
2887  *      @skb: buffer to post
2888  *
2889  *      This function receives a packet from a device driver and queues it for
2890  *      the upper (protocol) levels to process.  It always succeeds. The buffer
2891  *      may be dropped during processing for congestion control or by the
2892  *      protocol layers.
2893  *
2894  *      return values:
2895  *      NET_RX_SUCCESS  (no congestion)
2896  *      NET_RX_DROP     (packet was dropped)
2897  *
2898  */
2899
2900 int netif_rx(struct sk_buff *skb)
2901 {
2902         int ret;
2903
2904         /* if netpoll wants it, pretend we never saw it */
2905         if (netpoll_rx(skb))
2906                 return NET_RX_DROP;
2907
2908         net_timestamp_check(netdev_tstamp_prequeue, skb);
2909
2910         trace_netif_rx(skb);
2911 #ifdef CONFIG_RPS
2912         if (static_key_false(&rps_needed)) {
2913                 struct rps_dev_flow voidflow, *rflow = &voidflow;
2914                 int cpu;
2915
2916                 preempt_disable();
2917                 rcu_read_lock();
2918
2919                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
2920                 if (cpu < 0)
2921                         cpu = smp_processor_id();
2922
2923                 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
2924
2925                 rcu_read_unlock();
2926                 preempt_enable();
2927         } else
2928 #endif
2929         {
2930                 unsigned int qtail;
2931                 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
2932                 put_cpu();
2933         }
2934         return ret;
2935 }
2936 EXPORT_SYMBOL(netif_rx);
2937
2938 int netif_rx_ni(struct sk_buff *skb)
2939 {
2940         int err;
2941
2942         preempt_disable();
2943         err = netif_rx(skb);
2944         if (local_softirq_pending())
2945                 do_softirq();
2946         preempt_enable();
2947
2948         return err;
2949 }
2950 EXPORT_SYMBOL(netif_rx_ni);
2951
2952 static void net_tx_action(struct softirq_action *h)
2953 {
2954         struct softnet_data *sd = &__get_cpu_var(softnet_data);
2955
2956         if (sd->completion_queue) {
2957                 struct sk_buff *clist;
2958
2959                 local_irq_disable();
2960                 clist = sd->completion_queue;
2961                 sd->completion_queue = NULL;
2962                 local_irq_enable();
2963
2964                 while (clist) {
2965                         struct sk_buff *skb = clist;
2966                         clist = clist->next;
2967
2968                         WARN_ON(atomic_read(&skb->users));
2969                         trace_kfree_skb(skb, net_tx_action);
2970                         __kfree_skb(skb);
2971                 }
2972         }
2973
2974         if (sd->output_queue) {
2975                 struct Qdisc *head;
2976
2977                 local_irq_disable();
2978                 head = sd->output_queue;
2979                 sd->output_queue = NULL;
2980                 sd->output_queue_tailp = &sd->output_queue;
2981                 local_irq_enable();
2982
2983                 while (head) {
2984                         struct Qdisc *q = head;
2985                         spinlock_t *root_lock;
2986
2987                         head = head->next_sched;
2988
2989                         root_lock = qdisc_lock(q);
2990                         if (spin_trylock(root_lock)) {
2991                                 smp_mb__before_clear_bit();
2992                                 clear_bit(__QDISC_STATE_SCHED,
2993                                           &q->state);
2994                                 qdisc_run(q);
2995                                 spin_unlock(root_lock);
2996                         } else {
2997                                 if (!test_bit(__QDISC_STATE_DEACTIVATED,
2998                                               &q->state)) {
2999                                         __netif_reschedule(q);
3000                                 } else {
3001                                         smp_mb__before_clear_bit();
3002                                         clear_bit(__QDISC_STATE_SCHED,
3003                                                   &q->state);
3004                                 }
3005                         }
3006                 }
3007         }
3008 }
3009
3010 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3011     (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3012 /* This hook is defined here for ATM LANE */
3013 int (*br_fdb_test_addr_hook)(struct net_device *dev,
3014                              unsigned char *addr) __read_mostly;
3015 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3016 #endif
3017
3018 #ifdef CONFIG_NET_CLS_ACT
3019 /* TODO: Maybe we should just force sch_ingress to be compiled in
3020  * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
3021  * a compare and 2 stores extra right now if we dont have it on
3022  * but have CONFIG_NET_CLS_ACT
3023  * NOTE: This doesn't stop any functionality; if you dont have
3024  * the ingress scheduler, you just can't add policies on ingress.
3025  *
3026  */
3027 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3028 {
3029         struct net_device *dev = skb->dev;
3030         u32 ttl = G_TC_RTTL(skb->tc_verd);
3031         int result = TC_ACT_OK;
3032         struct Qdisc *q;
3033
3034         if (unlikely(MAX_RED_LOOP < ttl++)) {
3035                 if (net_ratelimit())
3036                         pr_warn("Redir loop detected Dropping packet (%d->%d)\n",
3037                                 skb->skb_iif, dev->ifindex);
3038                 return TC_ACT_SHOT;
3039         }
3040
3041         skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3042         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3043
3044         q = rxq->qdisc;
3045         if (q != &noop_qdisc) {
3046                 spin_lock(qdisc_lock(q));
3047                 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3048                         result = qdisc_enqueue_root(skb, q);
3049                 spin_unlock(qdisc_lock(q));
3050         }
3051
3052         return result;
3053 }
3054
3055 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3056                                          struct packet_type **pt_prev,
3057                                          int *ret, struct net_device *orig_dev)
3058 {
3059         struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3060
3061         if (!rxq || rxq->qdisc == &noop_qdisc)
3062                 goto out;
3063
3064         if (*pt_prev) {
3065                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3066                 *pt_prev = NULL;
3067         }
3068
3069         switch (ing_filter(skb, rxq)) {
3070         case TC_ACT_SHOT:
3071         case TC_ACT_STOLEN:
3072                 kfree_skb(skb);
3073                 return NULL;
3074         }
3075
3076 out:
3077         skb->tc_verd = 0;
3078         return skb;
3079 }
3080 #endif
3081
3082 /**
3083  *      netdev_rx_handler_register - register receive handler
3084  *      @dev: device to register a handler for
3085  *      @rx_handler: receive handler to register
3086  *      @rx_handler_data: data pointer that is used by rx handler
3087  *
3088  *      Register a receive hander for a device. This handler will then be
3089  *      called from __netif_receive_skb. A negative errno code is returned
3090  *      on a failure.
3091  *
3092  *      The caller must hold the rtnl_mutex.
3093  *
3094  *      For a general description of rx_handler, see enum rx_handler_result.
3095  */
3096 int netdev_rx_handler_register(struct net_device *dev,
3097                                rx_handler_func_t *rx_handler,
3098                                void *rx_handler_data)
3099 {
3100         ASSERT_RTNL();
3101
3102         if (dev->rx_handler)
3103                 return -EBUSY;
3104
3105         rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3106         rcu_assign_pointer(dev->rx_handler, rx_handler);
3107
3108         return 0;
3109 }
3110 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3111
3112 /**
3113  *      netdev_rx_handler_unregister - unregister receive handler
3114  *      @dev: device to unregister a handler from
3115  *
3116  *      Unregister a receive hander from a device.
3117  *
3118  *      The caller must hold the rtnl_mutex.
3119  */
3120 void netdev_rx_handler_unregister(struct net_device *dev)
3121 {
3122
3123         ASSERT_RTNL();
3124         RCU_INIT_POINTER(dev->rx_handler, NULL);
3125         RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3126 }
3127 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3128
3129 static int __netif_receive_skb(struct sk_buff *skb)
3130 {
3131         struct packet_type *ptype, *pt_prev;
3132         rx_handler_func_t *rx_handler;
3133         struct net_device *orig_dev;
3134         struct net_device *null_or_dev;
3135         bool deliver_exact = false;
3136         int ret = NET_RX_DROP;
3137         __be16 type;
3138
3139         net_timestamp_check(!netdev_tstamp_prequeue, skb);
3140
3141         trace_netif_receive_skb(skb);
3142
3143         /* if we've gotten here through NAPI, check netpoll */
3144         if (netpoll_receive_skb(skb))
3145                 return NET_RX_DROP;
3146
3147         if (!skb->skb_iif)
3148                 skb->skb_iif = skb->dev->ifindex;
3149         orig_dev = skb->dev;
3150
3151         skb_reset_network_header(skb);
3152         skb_reset_transport_header(skb);
3153         skb_reset_mac_len(skb);
3154
3155         pt_prev = NULL;
3156
3157         rcu_read_lock();
3158
3159 another_round:
3160
3161         __this_cpu_inc(softnet_data.processed);
3162
3163         if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) {
3164                 skb = vlan_untag(skb);
3165                 if (unlikely(!skb))
3166                         goto out;
3167         }
3168
3169 #ifdef CONFIG_NET_CLS_ACT
3170         if (skb->tc_verd & TC_NCLS) {
3171                 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3172                 goto ncls;
3173         }
3174 #endif
3175
3176         list_for_each_entry_rcu(ptype, &ptype_all, list) {
3177                 if (!ptype->dev || ptype->dev == skb->dev) {
3178                         if (pt_prev)
3179                                 ret = deliver_skb(skb, pt_prev, orig_dev);
3180                         pt_prev = ptype;
3181                 }
3182         }
3183
3184 #ifdef CONFIG_NET_CLS_ACT
3185         skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3186         if (!skb)
3187                 goto out;
3188 ncls:
3189 #endif
3190
3191         rx_handler = rcu_dereference(skb->dev->rx_handler);
3192         if (vlan_tx_tag_present(skb)) {
3193                 if (pt_prev) {
3194                         ret = deliver_skb(skb, pt_prev, orig_dev);
3195                         pt_prev = NULL;
3196                 }
3197                 if (vlan_do_receive(&skb, !rx_handler))
3198                         goto another_round;
3199                 else if (unlikely(!skb))
3200                         goto out;
3201         }
3202
3203         if (rx_handler) {
3204                 if (pt_prev) {
3205                         ret = deliver_skb(skb, pt_prev, orig_dev);
3206                         pt_prev = NULL;
3207                 }
3208                 switch (rx_handler(&skb)) {
3209                 case RX_HANDLER_CONSUMED:
3210                         goto out;
3211                 case RX_HANDLER_ANOTHER:
3212                         goto another_round;
3213                 case RX_HANDLER_EXACT:
3214                         deliver_exact = true;
3215                 case RX_HANDLER_PASS:
3216                         break;
3217                 default:
3218                         BUG();
3219                 }
3220         }
3221
3222         /* deliver only exact match when indicated */
3223         null_or_dev = deliver_exact ? skb->dev : NULL;
3224
3225         type = skb->protocol;
3226         list_for_each_entry_rcu(ptype,
3227                         &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3228                 if (ptype->type == type &&
3229                     (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3230                      ptype->dev == orig_dev)) {
3231                         if (pt_prev)
3232                                 ret = deliver_skb(skb, pt_prev, orig_dev);
3233                         pt_prev = ptype;
3234                 }
3235         }
3236
3237         if (pt_prev) {
3238                 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3239         } else {
3240                 atomic_long_inc(&skb->dev->rx_dropped);
3241                 kfree_skb(skb);
3242                 /* Jamal, now you will not able to escape explaining
3243                  * me how you were going to use this. :-)
3244                  */
3245                 ret = NET_RX_DROP;
3246         }
3247
3248 out:
3249         rcu_read_unlock();
3250         return ret;
3251 }
3252
3253 /**
3254  *      netif_receive_skb - process receive buffer from network
3255  *      @skb: buffer to process
3256  *
3257  *      netif_receive_skb() is the main receive data processing function.
3258  *      It always succeeds. The buffer may be dropped during processing
3259  *      for congestion control or by the protocol layers.
3260  *
3261  *      This function may only be called from softirq context and interrupts
3262  *      should be enabled.
3263  *
3264  *      Return values (usually ignored):
3265  *      NET_RX_SUCCESS: no congestion
3266  *      NET_RX_DROP: packet was dropped
3267  */
3268 int netif_receive_skb(struct sk_buff *skb)
3269 {
3270         net_timestamp_check(netdev_tstamp_prequeue, skb);
3271
3272         if (skb_defer_rx_timestamp(skb))
3273                 return NET_RX_SUCCESS;
3274
3275 #ifdef CONFIG_RPS
3276         if (static_key_false(&rps_needed)) {
3277                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3278                 int cpu, ret;
3279
3280                 rcu_read_lock();
3281
3282                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3283
3284                 if (cpu >= 0) {
3285                         ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3286                         rcu_read_unlock();
3287                         return ret;
3288                 }
3289                 rcu_read_unlock();
3290         }
3291 #endif
3292         return __netif_receive_skb(skb);
3293 }
3294 EXPORT_SYMBOL(netif_receive_skb);
3295
3296 /* Network device is going away, flush any packets still pending
3297  * Called with irqs disabled.
3298  */
3299 static void flush_backlog(void *arg)
3300 {
3301         struct net_device *dev = arg;
3302         struct softnet_data *sd = &__get_cpu_var(softnet_data);
3303         struct sk_buff *skb, *tmp;
3304
3305         rps_lock(sd);
3306         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3307                 if (skb->dev == dev) {
3308                         __skb_unlink(skb, &sd->input_pkt_queue);
3309                         kfree_skb(skb);
3310                         input_queue_head_incr(sd);
3311                 }
3312         }
3313         rps_unlock(sd);
3314
3315         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3316                 if (skb->dev == dev) {
3317                         __skb_unlink(skb, &sd->process_queue);
3318                         kfree_skb(skb);
3319                         input_queue_head_incr(sd);
3320                 }
3321         }
3322 }
3323
3324 static int napi_gro_complete(struct sk_buff *skb)
3325 {
3326         struct packet_type *ptype;
3327         __be16 type = skb->protocol;
3328         struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3329         int err = -ENOENT;
3330
3331         if (NAPI_GRO_CB(skb)->count == 1) {
3332                 skb_shinfo(skb)->gso_size = 0;
3333                 goto out;
3334         }
3335
3336         rcu_read_lock();
3337         list_for_each_entry_rcu(ptype, head, list) {
3338                 if (ptype->type != type || ptype->dev || !ptype->gro_complete)
3339                         continue;
3340
3341                 err = ptype->gro_complete(skb);
3342                 break;
3343         }
3344         rcu_read_unlock();
3345
3346         if (err) {
3347                 WARN_ON(&ptype->list == head);
3348                 kfree_skb(skb);
3349                 return NET_RX_SUCCESS;
3350         }
3351
3352 out:
3353         return netif_receive_skb(skb);
3354 }
3355
3356 inline void napi_gro_flush(struct napi_struct *napi)
3357 {
3358         struct sk_buff *skb, *next;
3359
3360         for (skb = napi->gro_list; skb; skb = next) {
3361                 next = skb->next;
3362                 skb->next = NULL;
3363                 napi_gro_complete(skb);
3364         }
3365
3366         napi->gro_count = 0;
3367         napi->gro_list = NULL;
3368 }
3369 EXPORT_SYMBOL(napi_gro_flush);
3370
3371 enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3372 {
3373         struct sk_buff **pp = NULL;
3374         struct packet_type *ptype;
3375         __be16 type = skb->protocol;
3376         struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3377         int same_flow;
3378         int mac_len;
3379         enum gro_result ret;
3380
3381         if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
3382                 goto normal;
3383
3384         if (skb_is_gso(skb) || skb_has_frag_list(skb))
3385                 goto normal;
3386
3387         rcu_read_lock();
3388         list_for_each_entry_rcu(ptype, head, list) {
3389                 if (ptype->type != type || ptype->dev || !ptype->gro_receive)
3390                         continue;
3391
3392                 skb_set_network_header(skb, skb_gro_offset(skb));
3393                 mac_len = skb->network_header - skb->mac_header;
3394                 skb->mac_len = mac_len;
3395                 NAPI_GRO_CB(skb)->same_flow = 0;
3396                 NAPI_GRO_CB(skb)->flush = 0;
3397                 NAPI_GRO_CB(skb)->free = 0;
3398
3399                 pp = ptype->gro_receive(&napi->gro_list, skb);
3400                 break;
3401         }
3402         rcu_read_unlock();
3403
3404         if (&ptype->list == head)
3405                 goto normal;
3406
3407         same_flow = NAPI_GRO_CB(skb)->same_flow;
3408         ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3409
3410         if (pp) {
3411                 struct sk_buff *nskb = *pp;
3412
3413                 *pp = nskb->next;
3414                 nskb->next = NULL;
3415                 napi_gro_complete(nskb);
3416                 napi->gro_count--;
3417         }
3418
3419         if (same_flow)
3420                 goto ok;
3421
3422         if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
3423                 goto normal;
3424
3425         napi->gro_count++;
3426         NAPI_GRO_CB(skb)->count = 1;
3427         skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3428         skb->next = napi->gro_list;
3429         napi->gro_list = skb;
3430         ret = GRO_HELD;
3431
3432 pull:
3433         if (skb_headlen(skb) < skb_gro_offset(skb)) {
3434                 int grow = skb_gro_offset(skb) - skb_headlen(skb);
3435
3436                 BUG_ON(skb->end - skb->tail < grow);
3437
3438                 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3439
3440                 skb->tail += grow;
3441                 skb->data_len -= grow;
3442
3443                 skb_shinfo(skb)->frags[0].page_offset += grow;
3444                 skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow);
3445
3446                 if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) {
3447                         skb_frag_unref(skb, 0);
3448                         memmove(skb_shinfo(skb)->frags,
3449                                 skb_shinfo(skb)->frags + 1,
3450                                 --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
3451                 }
3452         }
3453
3454 ok:
3455         return ret;
3456
3457 normal:
3458         ret = GRO_NORMAL;
3459         goto pull;
3460 }
3461 EXPORT_SYMBOL(dev_gro_receive);
3462
3463 static inline gro_result_t
3464 __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3465 {
3466         struct sk_buff *p;
3467         unsigned int maclen = skb->dev->hard_header_len;
3468
3469         for (p = napi->gro_list; p; p = p->next) {
3470                 unsigned long diffs;
3471
3472                 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3473                 diffs |= p->vlan_tci ^ skb->vlan_tci;
3474                 if (maclen == ETH_HLEN)
3475                         diffs |= compare_ether_header(skb_mac_header(p),
3476                                                       skb_gro_mac_header(skb));
3477                 else if (!diffs)
3478                         diffs = memcmp(skb_mac_header(p),
3479                                        skb_gro_mac_header(skb),
3480                                        maclen);
3481                 NAPI_GRO_CB(p)->same_flow = !diffs;
3482                 NAPI_GRO_CB(p)->flush = 0;
3483         }
3484
3485         return dev_gro_receive(napi, skb);
3486 }
3487
3488 gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3489 {
3490         switch (ret) {
3491         case GRO_NORMAL:
3492                 if (netif_receive_skb(skb))
3493                         ret = GRO_DROP;
3494                 break;
3495
3496         case GRO_DROP:
3497         case GRO_MERGED_FREE:
3498                 kfree_skb(skb);
3499                 break;
3500
3501         case GRO_HELD:
3502         case GRO_MERGED:
3503                 break;
3504         }
3505
3506         return ret;
3507 }
3508 EXPORT_SYMBOL(napi_skb_finish);
3509
3510 void skb_gro_reset_offset(struct sk_buff *skb)
3511 {
3512         NAPI_GRO_CB(skb)->data_offset = 0;
3513         NAPI_GRO_CB(skb)->frag0 = NULL;
3514         NAPI_GRO_CB(skb)->frag0_len = 0;
3515
3516         if (skb->mac_header == skb->tail &&
3517             !PageHighMem(skb_frag_page(&skb_shinfo(skb)->frags[0]))) {
3518                 NAPI_GRO_CB(skb)->frag0 =
3519                         skb_frag_address(&skb_shinfo(skb)->frags[0]);
3520                 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(&skb_shinfo(skb)->frags[0]);
3521         }
3522 }
3523 EXPORT_SYMBOL(skb_gro_reset_offset);
3524
3525 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3526 {
3527         skb_gro_reset_offset(skb);
3528
3529         return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
3530 }
3531 EXPORT_SYMBOL(napi_gro_receive);
3532
3533 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3534 {
3535         __skb_pull(skb, skb_headlen(skb));
3536         /* restore the reserve we had after netdev_alloc_skb_ip_align() */
3537         skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
3538         skb->vlan_tci = 0;
3539         skb->dev = napi->dev;
3540         skb->skb_iif = 0;
3541
3542         napi->skb = skb;
3543 }
3544
3545 struct sk_buff *napi_get_frags(struct napi_struct *napi)
3546 {
3547         struct sk_buff *skb = napi->skb;
3548
3549         if (!skb) {
3550                 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3551                 if (skb)
3552                         napi->skb = skb;
3553         }
3554         return skb;
3555 }
3556 EXPORT_SYMBOL(napi_get_frags);
3557
3558 gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3559                                gro_result_t ret)
3560 {
3561         switch (ret) {
3562         case GRO_NORMAL:
3563         case GRO_HELD:
3564                 skb->protocol = eth_type_trans(skb, skb->dev);
3565
3566                 if (ret == GRO_HELD)
3567                         skb_gro_pull(skb, -ETH_HLEN);
3568                 else if (netif_receive_skb(skb))
3569                         ret = GRO_DROP;
3570                 break;
3571
3572         case GRO_DROP:
3573         case GRO_MERGED_FREE:
3574                 napi_reuse_skb(napi, skb);
3575                 break;
3576
3577         case GRO_MERGED:
3578                 break;
3579         }
3580
3581         return ret;
3582 }
3583 EXPORT_SYMBOL(napi_frags_finish);
3584
3585 struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3586 {
3587         struct sk_buff *skb = napi->skb;
3588         struct ethhdr *eth;
3589         unsigned int hlen;
3590         unsigned int off;
3591
3592         napi->skb = NULL;
3593
3594         skb_reset_mac_header(skb);
3595         skb_gro_reset_offset(skb);
3596
3597         off = skb_gro_offset(skb);
3598         hlen = off + sizeof(*eth);
3599         eth = skb_gro_header_fast(skb, off);
3600         if (skb_gro_header_hard(skb, hlen)) {
3601                 eth = skb_gro_header_slow(skb, hlen, off);
3602                 if (unlikely(!eth)) {
3603                         napi_reuse_skb(napi, skb);
3604                         skb = NULL;
3605                         goto out;
3606                 }
3607         }
3608
3609         skb_gro_pull(skb, sizeof(*eth));
3610
3611         /*
3612          * This works because the only protocols we care about don't require
3613          * special handling.  We'll fix it up properly at the end.
3614          */
3615         skb->protocol = eth->h_proto;
3616
3617 out:
3618         return skb;
3619 }
3620 EXPORT_SYMBOL(napi_frags_skb);
3621
3622 gro_result_t napi_gro_frags(struct napi_struct *napi)
3623 {
3624         struct sk_buff *skb = napi_frags_skb(napi);
3625
3626         if (!skb)
3627                 return GRO_DROP;
3628
3629         return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
3630 }
3631 EXPORT_SYMBOL(napi_gro_frags);
3632
3633 /*
3634  * net_rps_action sends any pending IPI's for rps.
3635  * Note: called with local irq disabled, but exits with local irq enabled.
3636  */
3637 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
3638 {
3639 #ifdef CONFIG_RPS
3640         struct softnet_data *remsd = sd->rps_ipi_list;
3641
3642         if (remsd) {
3643                 sd->rps_ipi_list = NULL;
3644
3645                 local_irq_enable();
3646
3647                 /* Send pending IPI's to kick RPS processing on remote cpus. */
3648                 while (remsd) {
3649                         struct softnet_data *next = remsd->rps_ipi_next;
3650
3651                         if (cpu_online(remsd->cpu))
3652                                 __smp_call_function_single(remsd->cpu,
3653                                                            &remsd->csd, 0);
3654                         remsd = next;
3655                 }
3656         } else
3657 #endif
3658                 local_irq_enable();
3659 }
3660
3661 static int process_backlog(struct napi_struct *napi, int quota)
3662 {
3663         int work = 0;
3664         struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
3665
3666 #ifdef CONFIG_RPS
3667         /* Check if we have pending ipi, its better to send them now,
3668          * not waiting net_rx_action() end.
3669          */
3670         if (sd->rps_ipi_list) {
3671                 local_irq_disable();
3672                 net_rps_action_and_irq_enable(sd);
3673         }
3674 #endif
3675         napi->weight = weight_p;
3676         local_irq_disable();
3677         while (work < quota) {
3678                 struct sk_buff *skb;
3679                 unsigned int qlen;
3680
3681                 while ((skb = __skb_dequeue(&sd->process_queue))) {
3682                         local_irq_enable();
3683                         __netif_receive_skb(skb);
3684                         local_irq_disable();
3685                         input_queue_head_incr(sd);
3686                         if (++work >= quota) {
3687                                 local_irq_enable();
3688                                 return work;
3689                         }
3690                 }
3691
3692                 rps_lock(sd);
3693                 qlen = skb_queue_len(&sd->input_pkt_queue);
3694                 if (qlen)
3695                         skb_queue_splice_tail_init(&sd->input_pkt_queue,
3696                                                    &sd->process_queue);
3697
3698                 if (qlen < quota - work) {
3699                         /*
3700                          * Inline a custom version of __napi_complete().
3701                          * only current cpu owns and manipulates this napi,
3702                          * and NAPI_STATE_SCHED is the only possible flag set on backlog.
3703                          * we can use a plain write instead of clear_bit(),
3704                          * and we dont need an smp_mb() memory barrier.
3705                          */
3706                         list_del(&napi->poll_list);
3707                         napi->state = 0;
3708
3709                         quota = work + qlen;
3710                 }
3711                 rps_unlock(sd);
3712         }
3713         local_irq_enable();
3714
3715         return work;
3716 }
3717
3718 /**
3719  * __napi_schedule - schedule for receive
3720  * @n: entry to schedule
3721  *
3722  * The entry's receive function will be scheduled to run
3723  */
3724 void __napi_schedule(struct napi_struct *n)
3725 {
3726         unsigned long flags;
3727
3728         local_irq_save(flags);
3729         ____napi_schedule(&__get_cpu_var(softnet_data), n);
3730         local_irq_restore(flags);
3731 }
3732 EXPORT_SYMBOL(__napi_schedule);
3733
3734 void __napi_complete(struct napi_struct *n)
3735 {
3736         BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
3737         BUG_ON(n->gro_list);
3738
3739         list_del(&n->poll_list);
3740         smp_mb__before_clear_bit();
3741         clear_bit(NAPI_STATE_SCHED, &n->state);
3742 }
3743 EXPORT_SYMBOL(__napi_complete);
3744
3745 void napi_complete(struct napi_struct *n)
3746 {
3747         unsigned long flags;
3748
3749         /*
3750          * don't let napi dequeue from the cpu poll list
3751          * just in case its running on a different cpu
3752          */
3753         if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3754                 return;
3755
3756         napi_gro_flush(n);
3757         local_irq_save(flags);
3758         __napi_complete(n);
3759         local_irq_restore(flags);
3760 }
3761 EXPORT_SYMBOL(napi_complete);
3762
3763 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
3764                     int (*poll)(struct napi_struct *, int), int weight)
3765 {
3766         INIT_LIST_HEAD(&napi->poll_list);
3767         napi->gro_count = 0;
3768         napi->gro_list = NULL;
3769         napi->skb = NULL;
3770         napi->poll = poll;
3771         napi->weight = weight;
3772         list_add(&napi->dev_list, &dev->napi_list);
3773         napi->dev = dev;
3774 #ifdef CONFIG_NETPOLL
3775         spin_lock_init(&napi->poll_lock);
3776         napi->poll_owner = -1;
3777 #endif
3778         set_bit(NAPI_STATE_SCHED, &napi->state);
3779 }
3780 EXPORT_SYMBOL(netif_napi_add);
3781
3782 void netif_napi_del(struct napi_struct *napi)
3783 {
3784         struct sk_buff *skb, *next;
3785
3786         list_del_init(&napi->dev_list);
3787         napi_free_frags(napi);
3788
3789         for (skb = napi->gro_list; skb; skb = next) {
3790                 next = skb->next;
3791                 skb->next = NULL;
3792                 kfree_skb(skb);
3793         }
3794
3795         napi->gro_list = NULL;
3796         napi->gro_count = 0;
3797 }
3798 EXPORT_SYMBOL(netif_napi_del);
3799
3800 static void net_rx_action(struct softirq_action *h)
3801 {
3802         struct softnet_data *sd = &__get_cpu_var(softnet_data);
3803         unsigned long time_limit = jiffies + 2;
3804         int budget = netdev_budget;
3805         void *have;
3806
3807         local_irq_disable();
3808
3809         while (!list_empty(&sd->poll_list)) {
3810                 struct napi_struct *n;
3811                 int work, weight;
3812
3813                 /* If softirq window is exhuasted then punt.
3814                  * Allow this to run for 2 jiffies since which will allow
3815                  * an average latency of 1.5/HZ.
3816                  */
3817                 if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
3818                         goto softnet_break;
3819
3820                 local_irq_enable();
3821
3822                 /* Even though interrupts have been re-enabled, this
3823                  * access is safe because interrupts can only add new
3824                  * entries to the tail of this list, and only ->poll()
3825                  * calls can remove this head entry from the list.
3826                  */
3827                 n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
3828
3829                 have = netpoll_poll_lock(n);
3830
3831                 weight = n->weight;
3832
3833                 /* This NAPI_STATE_SCHED test is for avoiding a race
3834                  * with netpoll's poll_napi().  Only the entity which
3835                  * obtains the lock and sees NAPI_STATE_SCHED set will
3836                  * actually make the ->poll() call.  Therefore we avoid
3837                  * accidentally calling ->poll() when NAPI is not scheduled.
3838                  */
3839                 work = 0;
3840                 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
3841                         work = n->poll(n, weight);
3842                         trace_napi_poll(n);
3843                 }
3844
3845                 WARN_ON_ONCE(work > weight);
3846
3847                 budget -= work;
3848
3849                 local_irq_disable();
3850
3851                 /* Drivers must not modify the NAPI state if they
3852                  * consume the entire weight.  In such cases this code
3853                  * still "owns" the NAPI instance and therefore can
3854                  * move the instance around on the list at-will.
3855                  */
3856                 if (unlikely(work == weight)) {
3857                         if (unlikely(napi_disable_pending(n))) {
3858                                 local_irq_enable();
3859                                 napi_complete(n);
3860                                 local_irq_disable();
3861                         } else
3862                                 list_move_tail(&n->poll_list, &sd->poll_list);
3863                 }
3864
3865                 netpoll_poll_unlock(have);
3866         }
3867 out:
3868         net_rps_action_and_irq_enable(sd);
3869
3870 #ifdef CONFIG_NET_DMA
3871         /*
3872          * There may not be any more sk_buffs coming right now, so push
3873          * any pending DMA copies to hardware
3874          */
3875         dma_issue_pending_all();
3876 #endif
3877
3878         return;
3879
3880 softnet_break:
3881         sd->time_squeeze++;
3882         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3883         goto out;
3884 }
3885
3886 static gifconf_func_t *gifconf_list[NPROTO];
3887
3888 /**
3889  *      register_gifconf        -       register a SIOCGIF handler
3890  *      @family: Address family
3891  *      @gifconf: Function handler
3892  *
3893  *      Register protocol dependent address dumping routines. The handler
3894  *      that is passed must not be freed or reused until it has been replaced
3895  *      by another handler.
3896  */
3897 int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
3898 {
3899         if (family >= NPROTO)
3900                 return -EINVAL;
3901         gifconf_list[family] = gifconf;
3902         return 0;
3903 }
3904 EXPORT_SYMBOL(register_gifconf);
3905
3906
3907 /*
3908  *      Map an interface index to its name (SIOCGIFNAME)
3909  */
3910
3911 /*
3912  *      We need this ioctl for efficient implementation of the
3913  *      if_indextoname() function required by the IPv6 API.  Without
3914  *      it, we would have to search all the interfaces to find a
3915  *      match.  --pb
3916  */
3917
3918 static int dev_ifname(struct net *net, struct ifreq __user *arg)
3919 {
3920         struct net_device *dev;
3921         struct ifreq ifr;
3922
3923         /*
3924          *      Fetch the caller's info block.
3925          */
3926
3927         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3928                 return -EFAULT;
3929
3930         rcu_read_lock();
3931         dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
3932         if (!dev) {
3933                 rcu_read_unlock();
3934                 return -ENODEV;
3935         }
3936
3937         strcpy(ifr.ifr_name, dev->name);
3938         rcu_read_unlock();
3939
3940         if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
3941                 return -EFAULT;
3942         return 0;
3943 }
3944
3945 /*
3946  *      Perform a SIOCGIFCONF call. This structure will change
3947  *      size eventually, and there is nothing I can do about it.
3948  *      Thus we will need a 'compatibility mode'.
3949  */
3950
3951 static int dev_ifconf(struct net *net, char __user *arg)
3952 {
3953         struct ifconf ifc;
3954         struct net_device *dev;
3955         char __user *pos;
3956         int len;
3957         int total;
3958         int i;
3959
3960         /*
3961          *      Fetch the caller's info block.
3962          */
3963
3964         if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
3965                 return -EFAULT;
3966
3967         pos = ifc.ifc_buf;
3968         len = ifc.ifc_len;
3969
3970         /*
3971          *      Loop over the interfaces, and write an info block for each.
3972          */
3973
3974         total = 0;
3975         for_each_netdev(net, dev) {
3976                 for (i = 0; i < NPROTO; i++) {
3977                         if (gifconf_list[i]) {
3978                                 int done;
3979                                 if (!pos)
3980                                         done = gifconf_list[i](dev, NULL, 0);
3981                                 else
3982                                         done = gifconf_list[i](dev, pos + total,
3983                                                                len - total);
3984                                 if (done < 0)
3985                                         return -EFAULT;
3986                                 total += done;
3987                         }
3988                 }
3989         }
3990
3991         /*
3992          *      All done.  Write the updated control block back to the caller.
3993          */
3994         ifc.ifc_len = total;
3995
3996         /*
3997          *      Both BSD and Solaris return 0 here, so we do too.
3998          */
3999         return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
4000 }
4001
4002 #ifdef CONFIG_PROC_FS
4003
4004 #define BUCKET_SPACE (32 - NETDEV_HASHBITS - 1)
4005
4006 #define get_bucket(x) ((x) >> BUCKET_SPACE)
4007 #define get_offset(x) ((x) & ((1 << BUCKET_SPACE) - 1))
4008 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
4009
4010 static inline struct net_device *dev_from_same_bucket(struct seq_file *seq, loff_t *pos)
4011 {
4012         struct net *net = seq_file_net(seq);
4013         struct net_device *dev;
4014         struct hlist_node *p;
4015         struct hlist_head *h;
4016         unsigned int count = 0, offset = get_offset(*pos);
4017
4018         h = &net->dev_name_head[get_bucket(*pos)];
4019         hlist_for_each_entry_rcu(dev, p, h, name_hlist) {
4020                 if (++count == offset)
4021                         return dev;
4022         }
4023
4024         return NULL;
4025 }
4026
4027 static inline struct net_device *dev_from_bucket(struct seq_file *seq, loff_t *pos)
4028 {
4029         struct net_device *dev;
4030         unsigned int bucket;
4031
4032         do {
4033                 dev = dev_from_same_bucket(seq, pos);
4034                 if (dev)
4035                         return dev;
4036
4037                 bucket = get_bucket(*pos) + 1;
4038                 *pos = set_bucket_offset(bucket, 1);
4039         } while (bucket < NETDEV_HASHENTRIES);
4040
4041         return NULL;
4042 }
4043
4044 /*
4045  *      This is invoked by the /proc filesystem handler to display a device
4046  *      in detail.
4047  */
4048 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
4049         __acquires(RCU)
4050 {
4051         rcu_read_lock();
4052         if (!*pos)
4053                 return SEQ_START_TOKEN;
4054
4055         if (get_bucket(*pos) >= NETDEV_HASHENTRIES)
4056                 return NULL;
4057
4058         return dev_from_bucket(seq, pos);
4059 }
4060
4061 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4062 {
4063         ++*pos;
4064         return dev_from_bucket(seq, pos);
4065 }
4066
4067 void dev_seq_stop(struct seq_file *seq, void *v)
4068         __releases(RCU)
4069 {
4070         rcu_read_unlock();
4071 }
4072
4073 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
4074 {
4075         struct rtnl_link_stats64 temp;
4076         const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
4077
4078         seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
4079                    "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
4080                    dev->name, stats->rx_bytes, stats->rx_packets,
4081                    stats->rx_errors,
4082                    stats->rx_dropped + stats->rx_missed_errors,
4083                    stats->rx_fifo_errors,
4084                    stats->rx_length_errors + stats->rx_over_errors +
4085                     stats->rx_crc_errors + stats->rx_frame_errors,
4086                    stats->rx_compressed, stats->multicast,
4087                    stats->tx_bytes, stats->tx_packets,
4088                    stats->tx_errors, stats->tx_dropped,
4089                    stats->tx_fifo_errors, stats->collisions,
4090                    stats->tx_carrier_errors +
4091                     stats->tx_aborted_errors +
4092                     stats->tx_window_errors +
4093                     stats->tx_heartbeat_errors,
4094                    stats->tx_compressed);
4095 }
4096
4097 /*
4098  *      Called from the PROCfs module. This now uses the new arbitrary sized
4099  *      /proc/net interface to create /proc/net/dev
4100  */
4101 static int dev_seq_show(struct seq_file *seq, void *v)
4102 {
4103         if (v == SEQ_START_TOKEN)
4104                 seq_puts(seq, "Inter-|   Receive                            "
4105                               "                    |  Transmit\n"
4106                               " face |bytes    packets errs drop fifo frame "
4107                               "compressed multicast|bytes    packets errs "
4108                               "drop fifo colls carrier compressed\n");
4109         else
4110                 dev_seq_printf_stats(seq, v);
4111         return 0;
4112 }
4113
4114 static struct softnet_data *softnet_get_online(loff_t *pos)
4115 {
4116         struct softnet_data *sd = NULL;
4117
4118         while (*pos < nr_cpu_ids)
4119                 if (cpu_online(*pos)) {
4120                         sd = &per_cpu(softnet_data, *pos);
4121                         break;
4122                 } else
4123                         ++*pos;
4124         return sd;
4125 }
4126
4127 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
4128 {
4129         return softnet_get_online(pos);
4130 }
4131
4132 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4133 {
4134         ++*pos;
4135         return softnet_get_online(pos);
4136 }
4137
4138 static void softnet_seq_stop(struct seq_file *seq, void *v)
4139 {
4140 }
4141
4142 static int softnet_seq_show(struct seq_file *seq, void *v)
4143 {
4144         struct softnet_data *sd = v;
4145
4146         seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
4147                    sd->processed, sd->dropped, sd->time_squeeze, 0,
4148                    0, 0, 0, 0, /* was fastroute */
4149                    sd->cpu_collision, sd->received_rps);
4150         return 0;
4151 }
4152
4153 static const struct seq_operations dev_seq_ops = {
4154         .start = dev_seq_start,
4155         .next  = dev_seq_next,
4156         .stop  = dev_seq_stop,
4157         .show  = dev_seq_show,
4158 };
4159
4160 static int dev_seq_open(struct inode *inode, struct file *file)
4161 {
4162         return seq_open_net(inode, file, &dev_seq_ops,
4163                             sizeof(struct seq_net_private));
4164 }
4165
4166 static const struct file_operations dev_seq_fops = {
4167         .owner   = THIS_MODULE,
4168         .open    = dev_seq_open,
4169         .read    = seq_read,
4170         .llseek  = seq_lseek,
4171         .release = seq_release_net,
4172 };
4173
4174 static const struct seq_operations softnet_seq_ops = {
4175         .start = softnet_seq_start,
4176         .next  = softnet_seq_next,
4177         .stop  = softnet_seq_stop,
4178         .show  = softnet_seq_show,
4179 };
4180
4181 static int softnet_seq_open(struct inode *inode, struct file *file)
4182 {
4183         return seq_open(file, &softnet_seq_ops);
4184 }
4185
4186 static const struct file_operations softnet_seq_fops = {
4187         .owner   = THIS_MODULE,
4188         .open    = softnet_seq_open,
4189         .read    = seq_read,
4190         .llseek  = seq_lseek,
4191         .release = seq_release,
4192 };
4193
4194 static void *ptype_get_idx(loff_t pos)
4195 {
4196         struct packet_type *pt = NULL;
4197         loff_t i = 0;
4198         int t;
4199
4200         list_for_each_entry_rcu(pt, &ptype_all, list) {
4201                 if (i == pos)
4202                         return pt;
4203                 ++i;
4204         }
4205
4206         for (t = 0; t < PTYPE_HASH_SIZE; t++) {
4207                 list_for_each_entry_rcu(pt, &ptype_base[t], list) {
4208                         if (i == pos)
4209                                 return pt;
4210                         ++i;
4211                 }
4212         }
4213         return NULL;
4214 }
4215
4216 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
4217         __acquires(RCU)
4218 {
4219         rcu_read_lock();
4220         return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
4221 }
4222
4223 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4224 {
4225         struct packet_type *pt;
4226         struct list_head *nxt;
4227         int hash;
4228
4229         ++*pos;
4230         if (v == SEQ_START_TOKEN)
4231                 return ptype_get_idx(0);
4232
4233         pt = v;
4234         nxt = pt->list.next;
4235         if (pt->type == htons(ETH_P_ALL)) {
4236                 if (nxt != &ptype_all)
4237                         goto found;
4238                 hash = 0;
4239                 nxt = ptype_base[0].next;
4240         } else
4241                 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
4242
4243         while (nxt == &ptype_base[hash]) {
4244                 if (++hash >= PTYPE_HASH_SIZE)
4245                         return NULL;
4246                 nxt = ptype_base[hash].next;
4247         }
4248 found:
4249         return list_entry(nxt, struct packet_type, list);
4250 }
4251
4252 static void ptype_seq_stop(struct seq_file *seq, void *v)
4253         __releases(RCU)
4254 {
4255         rcu_read_unlock();
4256 }
4257
4258 static int ptype_seq_show(struct seq_file *seq, void *v)
4259 {
4260         struct packet_type *pt = v;
4261
4262         if (v == SEQ_START_TOKEN)
4263                 seq_puts(seq, "Type Device      Function\n");
4264         else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
4265                 if (pt->type == htons(ETH_P_ALL))
4266                         seq_puts(seq, "ALL ");
4267                 else
4268                         seq_printf(seq, "%04x", ntohs(pt->type));
4269
4270                 seq_printf(seq, " %-8s %pF\n",
4271                            pt->dev ? pt->dev->name : "", pt->func);
4272         }
4273
4274         return 0;
4275 }
4276
4277 static const struct seq_operations ptype_seq_ops = {
4278         .start = ptype_seq_start,
4279         .next  = ptype_seq_next,
4280         .stop  = ptype_seq_stop,
4281         .show  = ptype_seq_show,
4282 };
4283
4284 static int ptype_seq_open(struct inode *inode, struct file *file)
4285 {
4286         return seq_open_net(inode, file, &ptype_seq_ops,
4287                         sizeof(struct seq_net_private));
4288 }
4289
4290 static const struct file_operations ptype_seq_fops = {
4291         .owner   = THIS_MODULE,
4292         .open    = ptype_seq_open,
4293         .read    = seq_read,
4294         .llseek  = seq_lseek,
4295         .release = seq_release_net,
4296 };
4297
4298
4299 static int __net_init dev_proc_net_init(struct net *net)
4300 {
4301         int rc = -ENOMEM;
4302
4303         if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
4304                 goto out;
4305         if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
4306                 goto out_dev;
4307         if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
4308                 goto out_softnet;
4309
4310         if (wext_proc_init(net))
4311                 goto out_ptype;
4312         rc = 0;
4313 out:
4314         return rc;
4315 out_ptype:
4316         proc_net_remove(net, "ptype");
4317 out_softnet:
4318         proc_net_remove(net, "softnet_stat");
4319 out_dev:
4320         proc_net_remove(net, "dev");
4321         goto out;
4322 }
4323
4324 static void __net_exit dev_proc_net_exit(struct net *net)
4325 {
4326         wext_proc_exit(net);
4327
4328         proc_net_remove(net, "ptype");
4329         proc_net_remove(net, "softnet_stat");
4330         proc_net_remove(net, "dev");
4331 }
4332
4333 static struct pernet_operations __net_initdata dev_proc_ops = {
4334         .init = dev_proc_net_init,
4335         .exit = dev_proc_net_exit,
4336 };
4337
4338 static int __init dev_proc_init(void)
4339 {
4340         return register_pernet_subsys(&dev_proc_ops);
4341 }
4342 #else
4343 #define dev_proc_init() 0
4344 #endif  /* CONFIG_PROC_FS */
4345
4346
4347 /**
4348  *      netdev_set_master       -       set up master pointer
4349  *      @slave: slave device
4350  *      @master: new master device
4351  *
4352  *      Changes the master device of the slave. Pass %NULL to break the
4353  *      bonding. The caller must hold the RTNL semaphore. On a failure
4354  *      a negative errno code is returned. On success the reference counts
4355  *      are adjusted and the function returns zero.
4356  */
4357 int netdev_set_master(struct net_device *slave, struct net_device *master)
4358 {
4359         struct net_device *old = slave->master;
4360
4361         ASSERT_RTNL();
4362
4363         if (master) {
4364                 if (old)
4365                         return -EBUSY;
4366                 dev_hold(master);
4367         }
4368
4369         slave->master = master;
4370
4371         if (old)
4372                 dev_put(old);
4373         return 0;
4374 }
4375 EXPORT_SYMBOL(netdev_set_master);
4376
4377 /**
4378  *      netdev_set_bond_master  -       set up bonding master/slave pair
4379  *      @slave: slave device
4380  *      @master: new master device
4381  *
4382  *      Changes the master device of the slave. Pass %NULL to break the
4383  *      bonding. The caller must hold the RTNL semaphore. On a failure
4384  *      a negative errno code is returned. On success %RTM_NEWLINK is sent
4385  *      to the routing socket and the function returns zero.
4386  */
4387 int netdev_set_bond_master(struct net_device *slave, struct net_device *master)
4388 {
4389         int err;
4390
4391         ASSERT_RTNL();
4392
4393         err = netdev_set_master(slave, master);
4394         if (err)
4395                 return err;
4396         if (master)
4397                 slave->flags |= IFF_SLAVE;
4398         else
4399                 slave->flags &= ~IFF_SLAVE;
4400
4401         rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
4402         return 0;
4403 }
4404 EXPORT_SYMBOL(netdev_set_bond_master);
4405
4406 static void dev_change_rx_flags(struct net_device *dev, int flags)
4407 {
4408         const struct net_device_ops *ops = dev->netdev_ops;
4409
4410         if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4411                 ops->ndo_change_rx_flags(dev, flags);
4412 }
4413
4414 static int __dev_set_promiscuity(struct net_device *dev, int inc)
4415 {
4416         unsigned int old_flags = dev->flags;
4417         uid_t uid;
4418         gid_t gid;
4419
4420         ASSERT_RTNL();
4421
4422         dev->flags |= IFF_PROMISC;
4423         dev->promiscuity += inc;
4424         if (dev->promiscuity == 0) {
4425                 /*
4426                  * Avoid overflow.
4427                  * If inc causes overflow, untouch promisc and return error.
4428                  */
4429                 if (inc < 0)
4430                         dev->flags &= ~IFF_PROMISC;
4431                 else {
4432                         dev->promiscuity -= inc;
4433                         pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
4434                                 dev->name);
4435                         return -EOVERFLOW;
4436                 }
4437         }
4438         if (dev->flags != old_flags) {
4439                 pr_info("device %s %s promiscuous mode\n",
4440                         dev->name,
4441                         dev->flags & IFF_PROMISC ? "entered" : "left");
4442                 if (audit_enabled) {
4443                         current_uid_gid(&uid, &gid);
4444                         audit_log(current->audit_context, GFP_ATOMIC,
4445                                 AUDIT_ANOM_PROMISCUOUS,
4446                                 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4447                                 dev->name, (dev->flags & IFF_PROMISC),
4448                                 (old_flags & IFF_PROMISC),
4449                                 audit_get_loginuid(current),
4450                                 uid, gid,
4451                                 audit_get_sessionid(current));
4452                 }
4453
4454                 dev_change_rx_flags(dev, IFF_PROMISC);
4455         }
4456         return 0;
4457 }
4458
4459 /**
4460  *      dev_set_promiscuity     - update promiscuity count on a device
4461  *      @dev: device
4462  *      @inc: modifier
4463  *
4464  *      Add or remove promiscuity from a device. While the count in the device
4465  *      remains above zero the interface remains promiscuous. Once it hits zero
4466  *      the device reverts back to normal filtering operation. A negative inc
4467  *      value is used to drop promiscuity on the device.
4468  *      Return 0 if successful or a negative errno code on error.
4469  */
4470 int dev_set_promiscuity(struct net_device *dev, int inc)
4471 {
4472         unsigned int old_flags = dev->flags;
4473         int err;
4474
4475         err = __dev_set_promiscuity(dev, inc);
4476         if (err < 0)
4477                 return err;
4478         if (dev->flags != old_flags)
4479                 dev_set_rx_mode(dev);
4480         return err;
4481 }
4482 EXPORT_SYMBOL(dev_set_promiscuity);
4483
4484 /**
4485  *      dev_set_allmulti        - update allmulti count on a device
4486  *      @dev: device
4487  *      @inc: modifier
4488  *
4489  *      Add or remove reception of all multicast frames to a device. While the
4490  *      count in the device remains above zero the interface remains listening
4491  *      to all interfaces. Once it hits zero the device reverts back to normal
4492  *      filtering operation. A negative @inc value is used to drop the counter
4493  *      when releasing a resource needing all multicasts.
4494  *      Return 0 if successful or a negative errno code on error.
4495  */
4496
4497 int dev_set_allmulti(struct net_device *dev, int inc)
4498 {
4499         unsigned int old_flags = dev->flags;
4500
4501         ASSERT_RTNL();
4502
4503         dev->flags |= IFF_ALLMULTI;
4504         dev->allmulti += inc;
4505         if (dev->allmulti == 0) {
4506                 /*
4507                  * Avoid overflow.
4508                  * If inc causes overflow, untouch allmulti and return error.
4509                  */
4510                 if (inc < 0)
4511                         dev->flags &= ~IFF_ALLMULTI;
4512                 else {
4513                         dev->allmulti -= inc;
4514                         pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
4515                                 dev->name);
4516                         return -EOVERFLOW;
4517                 }
4518         }
4519         if (dev->flags ^ old_flags) {
4520                 dev_change_rx_flags(dev, IFF_ALLMULTI);
4521                 dev_set_rx_mode(dev);
4522         }
4523         return 0;
4524 }
4525 EXPORT_SYMBOL(dev_set_allmulti);
4526
4527 /*
4528  *      Upload unicast and multicast address lists to device and
4529  *      configure RX filtering. When the device doesn't support unicast
4530  *      filtering it is put in promiscuous mode while unicast addresses
4531  *      are present.
4532  */
4533 void __dev_set_rx_mode(struct net_device *dev)
4534 {
4535         const struct net_device_ops *ops = dev->netdev_ops;
4536
4537         /* dev_open will call this function so the list will stay sane. */
4538         if (!(dev->flags&IFF_UP))
4539                 return;
4540
4541         if (!netif_device_present(dev))
4542                 return;
4543
4544         if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
4545                 /* Unicast addresses changes may only happen under the rtnl,
4546                  * therefore calling __dev_set_promiscuity here is safe.
4547                  */
4548                 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4549                         __dev_set_promiscuity(dev, 1);
4550                         dev->uc_promisc = true;
4551                 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4552                         __dev_set_promiscuity(dev, -1);
4553                         dev->uc_promisc = false;
4554                 }
4555         }
4556
4557         if (ops->ndo_set_rx_mode)
4558                 ops->ndo_set_rx_mode(dev);
4559 }
4560
4561 void dev_set_rx_mode(struct net_device *dev)
4562 {
4563         netif_addr_lock_bh(dev);
4564         __dev_set_rx_mode(dev);
4565         netif_addr_unlock_bh(dev);
4566 }
4567
4568 /**
4569  *      dev_get_flags - get flags reported to userspace
4570  *      @dev: device
4571  *
4572  *      Get the combination of flag bits exported through APIs to userspace.
4573  */
4574 unsigned dev_get_flags(const struct net_device *dev)
4575 {
4576         unsigned flags;
4577
4578         flags = (dev->flags & ~(IFF_PROMISC |
4579                                 IFF_ALLMULTI |
4580                                 IFF_RUNNING |
4581                                 IFF_LOWER_UP |
4582                                 IFF_DORMANT)) |
4583                 (dev->gflags & (IFF_PROMISC |
4584                                 IFF_ALLMULTI));
4585
4586         if (netif_running(dev)) {
4587                 if (netif_oper_up(dev))
4588                         flags |= IFF_RUNNING;
4589                 if (netif_carrier_ok(dev))
4590                         flags |= IFF_LOWER_UP;
4591                 if (netif_dormant(dev))
4592                         flags |= IFF_DORMANT;
4593         }
4594
4595         return flags;
4596 }
4597 EXPORT_SYMBOL(dev_get_flags);
4598
4599 int __dev_change_flags(struct net_device *dev, unsigned int flags)
4600 {
4601         unsigned int old_flags = dev->flags;
4602         int ret;
4603
4604         ASSERT_RTNL();
4605
4606         /*
4607          *      Set the flags on our device.
4608          */
4609
4610         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4611                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4612                                IFF_AUTOMEDIA)) |
4613                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4614                                     IFF_ALLMULTI));
4615
4616         /*
4617          *      Load in the correct multicast list now the flags have changed.
4618          */
4619
4620         if ((old_flags ^ flags) & IFF_MULTICAST)
4621                 dev_change_rx_flags(dev, IFF_MULTICAST);
4622
4623         dev_set_rx_mode(dev);
4624
4625         /*
4626          *      Have we downed the interface. We handle IFF_UP ourselves
4627          *      according to user attempts to set it, rather than blindly
4628          *      setting it.
4629          */
4630
4631         ret = 0;
4632         if ((old_flags ^ flags) & IFF_UP) {     /* Bit is different  ? */
4633                 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4634
4635                 if (!ret)
4636                         dev_set_rx_mode(dev);
4637         }
4638
4639         if ((flags ^ dev->gflags) & IFF_PROMISC) {
4640                 int inc = (flags & IFF_PROMISC) ? 1 : -1;
4641
4642                 dev->gflags ^= IFF_PROMISC;
4643                 dev_set_promiscuity(dev, inc);
4644         }
4645
4646         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4647            is important. Some (broken) drivers set IFF_PROMISC, when
4648            IFF_ALLMULTI is requested not asking us and not reporting.
4649          */
4650         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4651                 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4652
4653                 dev->gflags ^= IFF_ALLMULTI;
4654                 dev_set_allmulti(dev, inc);
4655         }
4656
4657         return ret;
4658 }
4659
4660 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4661 {
4662         unsigned int changes = dev->flags ^ old_flags;
4663
4664         if (changes & IFF_UP) {
4665                 if (dev->flags & IFF_UP)
4666                         call_netdevice_notifiers(NETDEV_UP, dev);
4667                 else
4668                         call_netdevice_notifiers(NETDEV_DOWN, dev);
4669         }
4670
4671         if (dev->flags & IFF_UP &&
4672             (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4673                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
4674 }
4675
4676 /**
4677  *      dev_change_flags - change device settings
4678  *      @dev: device
4679  *      @flags: device state flags
4680  *
4681  *      Change settings on device based state flags. The flags are
4682  *      in the userspace exported format.
4683  */
4684 int dev_change_flags(struct net_device *dev, unsigned int flags)
4685 {
4686         int ret;
4687         unsigned int changes, old_flags = dev->flags;
4688
4689         ret = __dev_change_flags(dev, flags);
4690         if (ret < 0)
4691                 return ret;
4692
4693         changes = old_flags ^ dev->flags;
4694         if (changes)
4695                 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4696
4697         __dev_notify_flags(dev, old_flags);
4698         return ret;
4699 }
4700 EXPORT_SYMBOL(dev_change_flags);
4701
4702 /**
4703  *      dev_set_mtu - Change maximum transfer unit
4704  *      @dev: device
4705  *      @new_mtu: new transfer unit
4706  *
4707  *      Change the maximum transfer size of the network device.
4708  */
4709 int dev_set_mtu(struct net_device *dev, int new_mtu)
4710 {
4711         const struct net_device_ops *ops = dev->netdev_ops;
4712         int err;
4713
4714         if (new_mtu == dev->mtu)
4715                 return 0;
4716
4717         /*      MTU must be positive.    */
4718         if (new_mtu < 0)
4719                 return -EINVAL;
4720
4721         if (!netif_device_present(dev))
4722                 return -ENODEV;
4723
4724         err = 0;
4725         if (ops->ndo_change_mtu)
4726                 err = ops->ndo_change_mtu(dev, new_mtu);
4727         else
4728                 dev->mtu = new_mtu;
4729
4730         if (!err && dev->flags & IFF_UP)
4731                 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4732         return err;
4733 }
4734 EXPORT_SYMBOL(dev_set_mtu);
4735
4736 /**
4737  *      dev_set_group - Change group this device belongs to
4738  *      @dev: device
4739  *      @new_group: group this device should belong to
4740  */
4741 void dev_set_group(struct net_device *dev, int new_group)
4742 {
4743         dev->group = new_group;
4744 }
4745 EXPORT_SYMBOL(dev_set_group);
4746
4747 /**
4748  *      dev_set_mac_address - Change Media Access Control Address
4749  *      @dev: device
4750  *      @sa: new address
4751  *
4752  *      Change the hardware (MAC) address of the device
4753  */
4754 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4755 {
4756         const struct net_device_ops *ops = dev->netdev_ops;
4757         int err;
4758
4759         if (!ops->ndo_set_mac_address)
4760                 return -EOPNOTSUPP;
4761         if (sa->sa_family != dev->type)
4762                 return -EINVAL;
4763         if (!netif_device_present(dev))
4764                 return -ENODEV;
4765         err = ops->ndo_set_mac_address(dev, sa);
4766         if (!err)
4767                 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4768         return err;
4769 }
4770 EXPORT_SYMBOL(dev_set_mac_address);
4771
4772 /*
4773  *      Perform the SIOCxIFxxx calls, inside rcu_read_lock()
4774  */
4775 static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
4776 {
4777         int err;
4778         struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
4779
4780         if (!dev)
4781                 return -ENODEV;
4782
4783         switch (cmd) {
4784         case SIOCGIFFLAGS:      /* Get interface flags */
4785                 ifr->ifr_flags = (short) dev_get_flags(dev);
4786                 return 0;
4787
4788         case SIOCGIFMETRIC:     /* Get the metric on the interface
4789                                    (currently unused) */
4790                 ifr->ifr_metric = 0;
4791                 return 0;
4792
4793         case SIOCGIFMTU:        /* Get the MTU of a device */
4794                 ifr->ifr_mtu = dev->mtu;
4795                 return 0;
4796
4797         case SIOCGIFHWADDR:
4798                 if (!dev->addr_len)
4799                         memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4800                 else
4801                         memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4802                                min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4803                 ifr->ifr_hwaddr.sa_family = dev->type;
4804                 return 0;
4805
4806         case SIOCGIFSLAVE:
4807                 err = -EINVAL;
4808                 break;
4809
4810         case SIOCGIFMAP:
4811                 ifr->ifr_map.mem_start = dev->mem_start;
4812                 ifr->ifr_map.mem_end   = dev->mem_end;
4813                 ifr->ifr_map.base_addr = dev->base_addr;
4814                 ifr->ifr_map.irq       = dev->irq;
4815                 ifr->ifr_map.dma       = dev->dma;
4816                 ifr->ifr_map.port      = dev->if_port;
4817                 return 0;
4818
4819         case SIOCGIFINDEX:
4820                 ifr->ifr_ifindex = dev->ifindex;
4821                 return 0;
4822
4823         case SIOCGIFTXQLEN:
4824                 ifr->ifr_qlen = dev->tx_queue_len;
4825                 return 0;
4826
4827         default:
4828                 /* dev_ioctl() should ensure this case
4829                  * is never reached
4830                  */
4831                 WARN_ON(1);
4832                 err = -ENOTTY;
4833                 break;
4834
4835         }
4836         return err;
4837 }
4838
4839 /*
4840  *      Perform the SIOCxIFxxx calls, inside rtnl_lock()
4841  */
4842 static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4843 {
4844         int err;
4845         struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
4846         const struct net_device_ops *ops;
4847
4848         if (!dev)
4849                 return -ENODEV;
4850
4851         ops = dev->netdev_ops;
4852
4853         switch (cmd) {
4854         case SIOCSIFFLAGS:      /* Set interface flags */
4855                 return dev_change_flags(dev, ifr->ifr_flags);
4856
4857         case SIOCSIFMETRIC:     /* Set the metric on the interface
4858                                    (currently unused) */
4859                 return -EOPNOTSUPP;
4860
4861         case SIOCSIFMTU:        /* Set the MTU of a device */
4862                 return dev_set_mtu(dev, ifr->ifr_mtu);
4863
4864         case SIOCSIFHWADDR:
4865                 return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
4866
4867         case SIOCSIFHWBROADCAST:
4868                 if (ifr->ifr_hwaddr.sa_family != dev->type)
4869                         return -EINVAL;
4870                 memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4871                        min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4872                 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4873                 return 0;
4874
4875         case SIOCSIFMAP:
4876                 if (ops->ndo_set_config) {
4877                         if (!netif_device_present(dev))
4878                                 return -ENODEV;
4879                         return ops->ndo_set_config(dev, &ifr->ifr_map);
4880                 }
4881                 return -EOPNOTSUPP;
4882
4883         case SIOCADDMULTI:
4884                 if (!ops->ndo_set_rx_mode ||
4885                     ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4886                         return -EINVAL;
4887                 if (!netif_device_present(dev))
4888                         return -ENODEV;
4889                 return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
4890
4891         case SIOCDELMULTI:
4892                 if (!ops->ndo_set_rx_mode ||
4893                     ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4894                         return -EINVAL;
4895                 if (!netif_device_present(dev))
4896                         return -ENODEV;
4897                 return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
4898
4899         case SIOCSIFTXQLEN:
4900                 if (ifr->ifr_qlen < 0)
4901                         return -EINVAL;
4902                 dev->tx_queue_len = ifr->ifr_qlen;
4903                 return 0;
4904
4905         case SIOCSIFNAME:
4906                 ifr->ifr_newname[IFNAMSIZ-1] = '\0';
4907                 return dev_change_name(dev, ifr->ifr_newname);
4908
4909         case SIOCSHWTSTAMP:
4910                 err = net_hwtstamp_validate(ifr);
4911                 if (err)
4912                         return err;
4913                 /* fall through */
4914
4915         /*
4916          *      Unknown or private ioctl
4917          */
4918         default:
4919                 if ((cmd >= SIOCDEVPRIVATE &&
4920                     cmd <= SIOCDEVPRIVATE + 15) ||
4921                     cmd == SIOCBONDENSLAVE ||
4922                     cmd == SIOCBONDRELEASE ||
4923                     cmd == SIOCBONDSETHWADDR ||
4924                     cmd == SIOCBONDSLAVEINFOQUERY ||
4925                     cmd == SIOCBONDINFOQUERY ||
4926                     cmd == SIOCBONDCHANGEACTIVE ||
4927                     cmd == SIOCGMIIPHY ||
4928                     cmd == SIOCGMIIREG ||
4929                     cmd == SIOCSMIIREG ||
4930                     cmd == SIOCBRADDIF ||
4931                     cmd == SIOCBRDELIF ||
4932                     cmd == SIOCSHWTSTAMP ||
4933                     cmd == SIOCWANDEV) {
4934                         err = -EOPNOTSUPP;
4935                         if (ops->ndo_do_ioctl) {
4936                                 if (netif_device_present(dev))
4937                                         err = ops->ndo_do_ioctl(dev, ifr, cmd);
4938                                 else
4939                                         err = -ENODEV;
4940                         }
4941                 } else
4942                         err = -EINVAL;
4943
4944         }
4945         return err;
4946 }
4947
4948 /*
4949  *      This function handles all "interface"-type I/O control requests. The actual
4950  *      'doing' part of this is dev_ifsioc above.
4951  */
4952
4953 /**
4954  *      dev_ioctl       -       network device ioctl
4955  *      @net: the applicable net namespace
4956  *      @cmd: command to issue
4957  *      @arg: pointer to a struct ifreq in user space
4958  *
4959  *      Issue ioctl functions to devices. This is normally called by the
4960  *      user space syscall interfaces but can sometimes be useful for
4961  *      other purposes. The return value is the return from the syscall if
4962  *      positive or a negative errno code on error.
4963  */
4964
4965 int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
4966 {
4967         struct ifreq ifr;
4968         int ret;
4969         char *colon;
4970
4971         /* One special case: SIOCGIFCONF takes ifconf argument
4972            and requires shared lock, because it sleeps writing
4973            to user space.
4974          */
4975
4976         if (cmd == SIOCGIFCONF) {
4977                 rtnl_lock();
4978                 ret = dev_ifconf(net, (char __user *) arg);
4979                 rtnl_unlock();
4980                 return ret;
4981         }
4982         if (cmd == SIOCGIFNAME)
4983                 return dev_ifname(net, (struct ifreq __user *)arg);
4984
4985         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4986                 return -EFAULT;
4987
4988         ifr.ifr_name[IFNAMSIZ-1] = 0;
4989
4990         colon = strchr(ifr.ifr_name, ':');
4991         if (colon)
4992                 *colon = 0;
4993
4994         /*
4995          *      See which interface the caller is talking about.
4996          */
4997
4998         switch (cmd) {
4999         /*
5000          *      These ioctl calls:
5001          *      - can be done by all.
5002          *      - atomic and do not require locking.
5003          *      - return a value
5004          */
5005         case SIOCGIFFLAGS:
5006         case SIOCGIFMETRIC:
5007         case SIOCGIFMTU:
5008         case SIOCGIFHWADDR:
5009         case SIOCGIFSLAVE:
5010         case SIOCGIFMAP:
5011         case SIOCGIFINDEX:
5012         case SIOCGIFTXQLEN:
5013                 dev_load(net, ifr.ifr_name);
5014                 rcu_read_lock();
5015                 ret = dev_ifsioc_locked(net, &ifr, cmd);
5016                 rcu_read_unlock();
5017                 if (!ret) {
5018                         if (colon)
5019                                 *colon = ':';
5020                         if (copy_to_user(arg, &ifr,
5021                                          sizeof(struct ifreq)))
5022                                 ret = -EFAULT;
5023                 }
5024                 return ret;
5025
5026         case SIOCETHTOOL:
5027                 dev_load(net, ifr.ifr_name);
5028                 rtnl_lock();
5029                 ret = dev_ethtool(net, &ifr);
5030                 rtnl_unlock();
5031                 if (!ret) {
5032                         if (colon)
5033                                 *colon = ':';
5034                         if (copy_to_user(arg, &ifr,
5035                                          sizeof(struct ifreq)))
5036                                 ret = -EFAULT;
5037                 }
5038                 return ret;
5039
5040         /*
5041          *      These ioctl calls:
5042          *      - require superuser power.
5043          *      - require strict serialization.
5044          *      - return a value
5045          */
5046         case SIOCGMIIPHY:
5047         case SIOCGMIIREG:
5048         case SIOCSIFNAME:
5049                 if (!capable(CAP_NET_ADMIN))
5050                         return -EPERM;
5051                 dev_load(net, ifr.ifr_name);
5052                 rtnl_lock();
5053                 ret = dev_ifsioc(net, &ifr, cmd);
5054                 rtnl_unlock();
5055                 if (!ret) {
5056                         if (colon)
5057                                 *colon = ':';
5058                         if (copy_to_user(arg, &ifr,
5059                                          sizeof(struct ifreq)))
5060                                 ret = -EFAULT;
5061                 }
5062                 return ret;
5063
5064         /*
5065          *      These ioctl calls:
5066          *      - require superuser power.
5067          *      - require strict serialization.
5068          *      - do not return a value
5069          */
5070         case SIOCSIFFLAGS:
5071         case SIOCSIFMETRIC:
5072         case SIOCSIFMTU:
5073         case SIOCSIFMAP:
5074         case SIOCSIFHWADDR:
5075         case SIOCSIFSLAVE:
5076         case SIOCADDMULTI:
5077         case SIOCDELMULTI:
5078         case SIOCSIFHWBROADCAST:
5079         case SIOCSIFTXQLEN:
5080         case SIOCSMIIREG:
5081         case SIOCBONDENSLAVE:
5082         case SIOCBONDRELEASE:
5083         case SIOCBONDSETHWADDR:
5084         case SIOCBONDCHANGEACTIVE:
5085         case SIOCBRADDIF:
5086         case SIOCBRDELIF:
5087         case SIOCSHWTSTAMP:
5088                 if (!capable(CAP_NET_ADMIN))
5089                         return -EPERM;
5090                 /* fall through */
5091         case SIOCBONDSLAVEINFOQUERY:
5092         case SIOCBONDINFOQUERY:
5093                 dev_load(net, ifr.ifr_name);
5094                 rtnl_lock();
5095                 ret = dev_ifsioc(net, &ifr, cmd);
5096                 rtnl_unlock();
5097                 return ret;
5098
5099         case SIOCGIFMEM:
5100                 /* Get the per device memory space. We can add this but
5101                  * currently do not support it */
5102         case SIOCSIFMEM:
5103                 /* Set the per device memory buffer space.
5104                  * Not applicable in our case */
5105         case SIOCSIFLINK:
5106                 return -ENOTTY;
5107
5108         /*
5109          *      Unknown or private ioctl.
5110          */
5111         default:
5112                 if (cmd == SIOCWANDEV ||
5113                     (cmd >= SIOCDEVPRIVATE &&
5114                      cmd <= SIOCDEVPRIVATE + 15)) {
5115                         dev_load(net, ifr.ifr_name);
5116                         rtnl_lock();
5117                         ret = dev_ifsioc(net, &ifr, cmd);
5118                         rtnl_unlock();
5119                         if (!ret && copy_to_user(arg, &ifr,
5120                                                  sizeof(struct ifreq)))
5121                                 ret = -EFAULT;
5122                         return ret;
5123                 }
5124                 /* Take care of Wireless Extensions */
5125                 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
5126                         return wext_handle_ioctl(net, &ifr, cmd, arg);
5127                 return -ENOTTY;
5128         }
5129 }
5130
5131
5132 /**
5133  *      dev_new_index   -       allocate an ifindex
5134  *      @net: the applicable net namespace
5135  *
5136  *      Returns a suitable unique value for a new device interface
5137  *      number.  The caller must hold the rtnl semaphore or the
5138  *      dev_base_lock to be sure it remains unique.
5139  */
5140 static int dev_new_index(struct net *net)
5141 {
5142         static int ifindex;
5143         for (;;) {
5144                 if (++ifindex <= 0)
5145                         ifindex = 1;
5146                 if (!__dev_get_by_index(net, ifindex))
5147                         return ifindex;
5148         }
5149 }
5150
5151 /* Delayed registration/unregisteration */
5152 static LIST_HEAD(net_todo_list);
5153
5154 static void net_set_todo(struct net_device *dev)
5155 {
5156         list_add_tail(&dev->todo_list, &net_todo_list);
5157 }
5158
5159 static void rollback_registered_many(struct list_head *head)
5160 {
5161         struct net_device *dev, *tmp;
5162
5163         BUG_ON(dev_boot_phase);
5164         ASSERT_RTNL();
5165
5166         list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5167                 /* Some devices call without registering
5168                  * for initialization unwind. Remove those
5169                  * devices and proceed with the remaining.
5170                  */
5171                 if (dev->reg_state == NETREG_UNINITIALIZED) {
5172                         pr_debug("unregister_netdevice: device %s/%p never was registered\n",
5173                                  dev->name, dev);
5174
5175                         WARN_ON(1);
5176                         list_del(&dev->unreg_list);
5177                         continue;
5178                 }
5179                 dev->dismantle = true;
5180                 BUG_ON(dev->reg_state != NETREG_REGISTERED);
5181         }
5182
5183         /* If device is running, close it first. */
5184         dev_close_many(head);
5185
5186         list_for_each_entry(dev, head, unreg_list) {
5187                 /* And unlink it from device chain. */
5188                 unlist_netdevice(dev);
5189
5190                 dev->reg_state = NETREG_UNREGISTERING;
5191         }
5192
5193         synchronize_net();
5194
5195         list_for_each_entry(dev, head, unreg_list) {
5196                 /* Shutdown queueing discipline. */
5197                 dev_shutdown(dev);
5198
5199
5200                 /* Notify protocols, that we are about to destroy
5201                    this device. They should clean all the things.
5202                 */
5203                 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5204
5205                 if (!dev->rtnl_link_ops ||
5206                     dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5207                         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5208
5209                 /*
5210                  *      Flush the unicast and multicast chains
5211                  */
5212                 dev_uc_flush(dev);
5213                 dev_mc_flush(dev);
5214
5215                 if (dev->netdev_ops->ndo_uninit)
5216                         dev->netdev_ops->ndo_uninit(dev);
5217
5218                 /* Notifier chain MUST detach us from master device. */
5219                 WARN_ON(dev->master);
5220
5221                 /* Remove entries from kobject tree */
5222                 netdev_unregister_kobject(dev);
5223         }
5224
5225         /* Process any work delayed until the end of the batch */
5226         dev = list_first_entry(head, struct net_device, unreg_list);
5227         call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
5228
5229         synchronize_net();
5230
5231         list_for_each_entry(dev, head, unreg_list)
5232                 dev_put(dev);
5233 }
5234
5235 static void rollback_registered(struct net_device *dev)
5236 {
5237         LIST_HEAD(single);
5238
5239         list_add(&dev->unreg_list, &single);
5240         rollback_registered_many(&single);
5241         list_del(&single);
5242 }
5243
5244 static netdev_features_t netdev_fix_features(struct net_device *dev,
5245         netdev_features_t features)
5246 {
5247         /* Fix illegal checksum combinations */
5248         if ((features & NETIF_F_HW_CSUM) &&
5249             (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5250                 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
5251                 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5252         }
5253
5254         /* Fix illegal SG+CSUM combinations. */
5255         if ((features & NETIF_F_SG) &&
5256             !(features & NETIF_F_ALL_CSUM)) {
5257                 netdev_dbg(dev,
5258                         "Dropping NETIF_F_SG since no checksum feature.\n");
5259                 features &= ~NETIF_F_SG;
5260         }
5261
5262         /* TSO requires that SG is present as well. */
5263         if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
5264                 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
5265                 features &= ~NETIF_F_ALL_TSO;
5266         }
5267
5268         /* TSO ECN requires that TSO is present as well. */
5269         if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5270                 features &= ~NETIF_F_TSO_ECN;
5271
5272         /* Software GSO depends on SG. */
5273         if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5274                 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5275                 features &= ~NETIF_F_GSO;
5276         }
5277
5278         /* UFO needs SG and checksumming */
5279         if (features & NETIF_F_UFO) {
5280                 /* maybe split UFO into V4 and V6? */
5281                 if (!((features & NETIF_F_GEN_CSUM) ||
5282                     (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5283                             == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5284                         netdev_dbg(dev,
5285                                 "Dropping NETIF_F_UFO since no checksum offload features.\n");
5286                         features &= ~NETIF_F_UFO;
5287                 }
5288
5289                 if (!(features & NETIF_F_SG)) {
5290                         netdev_dbg(dev,
5291                                 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5292                         features &= ~NETIF_F_UFO;
5293                 }
5294         }
5295
5296         return features;
5297 }
5298
5299 int __netdev_update_features(struct net_device *dev)
5300 {
5301         netdev_features_t features;
5302         int err = 0;
5303
5304         ASSERT_RTNL();
5305
5306         features = netdev_get_wanted_features(dev);
5307
5308         if (dev->netdev_ops->ndo_fix_features)
5309                 features = dev->netdev_ops->ndo_fix_features(dev, features);
5310
5311         /* driver might be less strict about feature dependencies */
5312         features = netdev_fix_features(dev, features);
5313
5314         if (dev->features == features)
5315                 return 0;
5316
5317         netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
5318                 &dev->features, &features);
5319
5320         if (dev->netdev_ops->ndo_set_features)
5321                 err = dev->netdev_ops->ndo_set_features(dev, features);
5322
5323         if (unlikely(err < 0)) {
5324                 netdev_err(dev,
5325                         "set_features() failed (%d); wanted %pNF, left %pNF\n",
5326                         err, &features, &dev->features);
5327                 return -1;
5328         }
5329
5330         if (!err)
5331                 dev->features = features;
5332
5333         return 1;
5334 }
5335
5336 /**
5337  *      netdev_update_features - recalculate device features
5338  *      @dev: the device to check
5339  *
5340  *      Recalculate dev->features set and send notifications if it
5341  *      has changed. Should be called after driver or hardware dependent
5342  *      conditions might have changed that influence the features.
5343  */
5344 void netdev_update_features(struct net_device *dev)
5345 {
5346         if (__netdev_update_features(dev))
5347                 netdev_features_change(dev);
5348 }
5349 EXPORT_SYMBOL(netdev_update_features);
5350
5351 /**
5352  *      netdev_change_features - recalculate device features
5353  *      @dev: the device to check
5354  *
5355  *      Recalculate dev->features set and send notifications even
5356  *      if they have not changed. Should be called instead of
5357  *      netdev_update_features() if also dev->vlan_features might
5358  *      have changed to allow the changes to be propagated to stacked
5359  *      VLAN devices.
5360  */
5361 void netdev_change_features(struct net_device *dev)
5362 {
5363         __netdev_update_features(dev);
5364         netdev_features_change(dev);
5365 }
5366 EXPORT_SYMBOL(netdev_change_features);
5367
5368 /**
5369  *      netif_stacked_transfer_operstate -      transfer operstate
5370  *      @rootdev: the root or lower level device to transfer state from
5371  *      @dev: the device to transfer operstate to
5372  *
5373  *      Transfer operational state from root to device. This is normally
5374  *      called when a stacking relationship exists between the root
5375  *      device and the device(a leaf device).
5376  */
5377 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5378                                         struct net_device *dev)
5379 {
5380         if (rootdev->operstate == IF_OPER_DORMANT)
5381                 netif_dormant_on(dev);
5382         else
5383                 netif_dormant_off(dev);
5384
5385         if (netif_carrier_ok(rootdev)) {
5386                 if (!netif_carrier_ok(dev))
5387                         netif_carrier_on(dev);
5388         } else {
5389                 if (netif_carrier_ok(dev))
5390                         netif_carrier_off(dev);
5391         }
5392 }
5393 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5394
5395 #ifdef CONFIG_RPS
5396 static int netif_alloc_rx_queues(struct net_device *dev)
5397 {
5398         unsigned int i, count = dev->num_rx_queues;
5399         struct netdev_rx_queue *rx;
5400
5401         BUG_ON(count < 1);
5402
5403         rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
5404         if (!rx) {
5405                 pr_err("netdev: Unable to allocate %u rx queues\n", count);
5406                 return -ENOMEM;
5407         }
5408         dev->_rx = rx;
5409
5410         for (i = 0; i < count; i++)
5411                 rx[i].dev = dev;
5412         return 0;
5413 }
5414 #endif
5415
5416 static void netdev_init_one_queue(struct net_device *dev,
5417                                   struct netdev_queue *queue, void *_unused)
5418 {
5419         /* Initialize queue lock */
5420         spin_lock_init(&queue->_xmit_lock);
5421         netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5422         queue->xmit_lock_owner = -1;
5423         netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5424         queue->dev = dev;
5425 #ifdef CONFIG_BQL
5426         dql_init(&queue->dql, HZ);
5427 #endif
5428 }
5429
5430 static int netif_alloc_netdev_queues(struct net_device *dev)
5431 {
5432         unsigned int count = dev->num_tx_queues;
5433         struct netdev_queue *tx;
5434
5435         BUG_ON(count < 1);
5436
5437         tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
5438         if (!tx) {
5439                 pr_err("netdev: Unable to allocate %u tx queues\n", count);
5440                 return -ENOMEM;
5441         }
5442         dev->_tx = tx;
5443
5444         netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5445         spin_lock_init(&dev->tx_global_lock);
5446
5447         return 0;
5448 }
5449
5450 /**
5451  *      register_netdevice      - register a network device
5452  *      @dev: device to register
5453  *
5454  *      Take a completed network device structure and add it to the kernel
5455  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5456  *      chain. 0 is returned on success. A negative errno code is returned
5457  *      on a failure to set up the device, or if the name is a duplicate.
5458  *
5459  *      Callers must hold the rtnl semaphore. You may want
5460  *      register_netdev() instead of this.
5461  *
5462  *      BUGS:
5463  *      The locking appears insufficient to guarantee two parallel registers
5464  *      will not get the same name.
5465  */
5466
5467 int register_netdevice(struct net_device *dev)
5468 {
5469         int ret;
5470         struct net *net = dev_net(dev);
5471
5472         BUG_ON(dev_boot_phase);
5473         ASSERT_RTNL();
5474
5475         might_sleep();
5476
5477         /* When net_device's are persistent, this will be fatal. */
5478         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5479         BUG_ON(!net);
5480
5481         spin_lock_init(&dev->addr_list_lock);
5482         netdev_set_addr_lockdep_class(dev);
5483
5484         dev->iflink = -1;
5485
5486         ret = dev_get_valid_name(dev, dev->name);
5487         if (ret < 0)
5488                 goto out;
5489
5490         /* Init, if this function is available */
5491         if (dev->netdev_ops->ndo_init) {
5492                 ret = dev->netdev_ops->ndo_init(dev);
5493                 if (ret) {
5494                         if (ret > 0)
5495                                 ret = -EIO;
5496                         goto out;
5497                 }
5498         }
5499
5500         dev->ifindex = dev_new_index(net);
5501         if (dev->iflink == -1)
5502                 dev->iflink = dev->ifindex;
5503
5504         /* Transfer changeable features to wanted_features and enable
5505          * software offloads (GSO and GRO).
5506          */
5507         dev->hw_features |= NETIF_F_SOFT_FEATURES;
5508         dev->features |= NETIF_F_SOFT_FEATURES;
5509         dev->wanted_features = dev->features & dev->hw_features;
5510
5511         /* Turn on no cache copy if HW is doing checksum */
5512         if (!(dev->flags & IFF_LOOPBACK)) {
5513                 dev->hw_features |= NETIF_F_NOCACHE_COPY;
5514                 if (dev->features & NETIF_F_ALL_CSUM) {
5515                         dev->wanted_features |= NETIF_F_NOCACHE_COPY;
5516                         dev->features |= NETIF_F_NOCACHE_COPY;
5517                 }
5518         }
5519
5520         /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
5521          */
5522         dev->vlan_features |= NETIF_F_HIGHDMA;
5523
5524         ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5525         ret = notifier_to_errno(ret);
5526         if (ret)
5527                 goto err_uninit;
5528
5529         ret = netdev_register_kobject(dev);
5530         if (ret)
5531                 goto err_uninit;
5532         dev->reg_state = NETREG_REGISTERED;
5533
5534         __netdev_update_features(dev);
5535
5536         /*
5537          *      Default initial state at registry is that the
5538          *      device is present.
5539          */
5540
5541         set_bit(__LINK_STATE_PRESENT, &dev->state);
5542
5543         dev_init_scheduler(dev);
5544         dev_hold(dev);
5545         list_netdevice(dev);
5546
5547         /* Notify protocols, that a new device appeared. */
5548         ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5549         ret = notifier_to_errno(ret);
5550         if (ret) {
5551                 rollback_registered(dev);
5552                 dev->reg_state = NETREG_UNREGISTERED;
5553         }
5554         /*
5555          *      Prevent userspace races by waiting until the network
5556          *      device is fully setup before sending notifications.
5557          */
5558         if (!dev->rtnl_link_ops ||
5559             dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5560                 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5561
5562 out:
5563         return ret;
5564
5565 err_uninit:
5566         if (dev->netdev_ops->ndo_uninit)
5567                 dev->netdev_ops->ndo_uninit(dev);
5568         goto out;
5569 }
5570 EXPORT_SYMBOL(register_netdevice);
5571
5572 /**
5573  *      init_dummy_netdev       - init a dummy network device for NAPI
5574  *      @dev: device to init
5575  *
5576  *      This takes a network device structure and initialize the minimum
5577  *      amount of fields so it can be used to schedule NAPI polls without
5578  *      registering a full blown interface. This is to be used by drivers
5579  *      that need to tie several hardware interfaces to a single NAPI
5580  *      poll scheduler due to HW limitations.
5581  */
5582 int init_dummy_netdev(struct net_device *dev)
5583 {
5584         /* Clear everything. Note we don't initialize spinlocks
5585          * are they aren't supposed to be taken by any of the
5586          * NAPI code and this dummy netdev is supposed to be
5587          * only ever used for NAPI polls
5588          */
5589         memset(dev, 0, sizeof(struct net_device));
5590
5591         /* make sure we BUG if trying to hit standard
5592          * register/unregister code path
5593          */
5594         dev->reg_state = NETREG_DUMMY;
5595
5596         /* NAPI wants this */
5597         INIT_LIST_HEAD(&dev->napi_list);
5598
5599         /* a dummy interface is started by default */
5600         set_bit(__LINK_STATE_PRESENT, &dev->state);
5601         set_bit(__LINK_STATE_START, &dev->state);
5602
5603         /* Note : We dont allocate pcpu_refcnt for dummy devices,
5604          * because users of this 'device' dont need to change
5605          * its refcount.
5606          */
5607
5608         return 0;
5609 }
5610 EXPORT_SYMBOL_GPL(init_dummy_netdev);
5611
5612
5613 /**
5614  *      register_netdev - register a network device
5615  *      @dev: device to register
5616  *
5617  *      Take a completed network device structure and add it to the kernel
5618  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5619  *      chain. 0 is returned on success. A negative errno code is returned
5620  *      on a failure to set up the device, or if the name is a duplicate.
5621  *
5622  *      This is a wrapper around register_netdevice that takes the rtnl semaphore
5623  *      and expands the device name if you passed a format string to
5624  *      alloc_netdev.
5625  */
5626 int register_netdev(struct net_device *dev)
5627 {
5628         int err;
5629
5630         rtnl_lock();
5631         err = register_netdevice(dev);
5632         rtnl_unlock();
5633         return err;
5634 }
5635 EXPORT_SYMBOL(register_netdev);
5636
5637 int netdev_refcnt_read(const struct net_device *dev)
5638 {
5639         int i, refcnt = 0;
5640
5641         for_each_possible_cpu(i)
5642                 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5643         return refcnt;
5644 }
5645 EXPORT_SYMBOL(netdev_refcnt_read);
5646
5647 /*
5648  * netdev_wait_allrefs - wait until all references are gone.
5649  *
5650  * This is called when unregistering network devices.
5651  *
5652  * Any protocol or device that holds a reference should register
5653  * for netdevice notification, and cleanup and put back the
5654  * reference if they receive an UNREGISTER event.
5655  * We can get stuck here if buggy protocols don't correctly
5656  * call dev_put.
5657  */
5658 static void netdev_wait_allrefs(struct net_device *dev)
5659 {
5660         unsigned long rebroadcast_time, warning_time;
5661         int refcnt;
5662
5663         linkwatch_forget_dev(dev);
5664
5665         rebroadcast_time = warning_time = jiffies;
5666         refcnt = netdev_refcnt_read(dev);
5667
5668         while (refcnt != 0) {
5669                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5670                         rtnl_lock();
5671
5672                         /* Rebroadcast unregister notification */
5673                         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5674                         /* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
5675                          * should have already handle it the first time */
5676
5677                         if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5678                                      &dev->state)) {
5679                                 /* We must not have linkwatch events
5680                                  * pending on unregister. If this
5681                                  * happens, we simply run the queue
5682                                  * unscheduled, resulting in a noop
5683                                  * for this device.
5684                                  */
5685                                 linkwatch_run_queue();
5686                         }
5687
5688                         __rtnl_unlock();
5689
5690                         rebroadcast_time = jiffies;
5691                 }
5692
5693                 msleep(250);
5694
5695                 refcnt = netdev_refcnt_read(dev);
5696
5697                 if (time_after(jiffies, warning_time + 10 * HZ)) {
5698                         pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
5699                                  dev->name, refcnt);
5700                         warning_time = jiffies;
5701                 }
5702         }
5703 }
5704
5705 /* The sequence is:
5706  *
5707  *      rtnl_lock();
5708  *      ...
5709  *      register_netdevice(x1);
5710  *      register_netdevice(x2);
5711  *      ...
5712  *      unregister_netdevice(y1);
5713  *      unregister_netdevice(y2);
5714  *      ...
5715  *      rtnl_unlock();
5716  *      free_netdev(y1);
5717  *      free_netdev(y2);
5718  *
5719  * We are invoked by rtnl_unlock().
5720  * This allows us to deal with problems:
5721  * 1) We can delete sysfs objects which invoke hotplug
5722  *    without deadlocking with linkwatch via keventd.
5723  * 2) Since we run with the RTNL semaphore not held, we can sleep
5724  *    safely in order to wait for the netdev refcnt to drop to zero.
5725  *
5726  * We must not return until all unregister events added during
5727  * the interval the lock was held have been completed.
5728  */
5729 void netdev_run_todo(void)
5730 {
5731         struct list_head list;
5732
5733         /* Snapshot list, allow later requests */
5734         list_replace_init(&net_todo_list, &list);
5735
5736         __rtnl_unlock();
5737
5738         /* Wait for rcu callbacks to finish before attempting to drain
5739          * the device list.  This usually avoids a 250ms wait.
5740          */
5741         if (!list_empty(&list))
5742                 rcu_barrier();
5743
5744         while (!list_empty(&list)) {
5745                 struct net_device *dev
5746                         = list_first_entry(&list, struct net_device, todo_list);
5747                 list_del(&dev->todo_list);
5748
5749                 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5750                         pr_err("network todo '%s' but state %d\n",
5751                                dev->name, dev->reg_state);
5752                         dump_stack();
5753                         continue;
5754                 }
5755
5756                 dev->reg_state = NETREG_UNREGISTERED;
5757
5758                 on_each_cpu(flush_backlog, dev, 1);
5759
5760                 netdev_wait_allrefs(dev);
5761
5762                 /* paranoia */
5763                 BUG_ON(netdev_refcnt_read(dev));
5764                 WARN_ON(rcu_access_pointer(dev->ip_ptr));
5765                 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
5766                 WARN_ON(dev->dn_ptr);
5767
5768                 if (dev->destructor)
5769                         dev->destructor(dev);
5770
5771                 /* Free network device */
5772                 kobject_put(&dev->dev.kobj);
5773         }
5774 }
5775
5776 /* Convert net_device_stats to rtnl_link_stats64.  They have the same
5777  * fields in the same order, with only the type differing.
5778  */
5779 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5780                              const struct net_device_stats *netdev_stats)
5781 {
5782 #if BITS_PER_LONG == 64
5783         BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5784         memcpy(stats64, netdev_stats, sizeof(*stats64));
5785 #else
5786         size_t i, n = sizeof(*stats64) / sizeof(u64);
5787         const unsigned long *src = (const unsigned long *)netdev_stats;
5788         u64 *dst = (u64 *)stats64;
5789
5790         BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5791                      sizeof(*stats64) / sizeof(u64));
5792         for (i = 0; i < n; i++)
5793                 dst[i] = src[i];
5794 #endif
5795 }
5796 EXPORT_SYMBOL(netdev_stats_to_stats64);
5797
5798 /**
5799  *      dev_get_stats   - get network device statistics
5800  *      @dev: device to get statistics from
5801  *      @storage: place to store stats
5802  *
5803  *      Get network statistics from device. Return @storage.
5804  *      The device driver may provide its own method by setting
5805  *      dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
5806  *      otherwise the internal statistics structure is used.
5807  */
5808 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5809                                         struct rtnl_link_stats64 *storage)
5810 {
5811         const struct net_device_ops *ops = dev->netdev_ops;
5812
5813         if (ops->ndo_get_stats64) {
5814                 memset(storage, 0, sizeof(*storage));
5815                 ops->ndo_get_stats64(dev, storage);
5816         } else if (ops->ndo_get_stats) {
5817                 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
5818         } else {
5819                 netdev_stats_to_stats64(storage, &dev->stats);
5820         }
5821         storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
5822         return storage;
5823 }
5824 EXPORT_SYMBOL(dev_get_stats);
5825
5826 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
5827 {
5828         struct netdev_queue *queue = dev_ingress_queue(dev);
5829
5830 #ifdef CONFIG_NET_CLS_ACT
5831         if (queue)
5832                 return queue;
5833         queue = kzalloc(sizeof(*queue), GFP_KERNEL);
5834         if (!queue)
5835                 return NULL;
5836         netdev_init_one_queue(dev, queue, NULL);
5837         queue->qdisc = &noop_qdisc;
5838         queue->qdisc_sleeping = &noop_qdisc;
5839         rcu_assign_pointer(dev->ingress_queue, queue);
5840 #endif
5841         return queue;
5842 }
5843
5844 /**
5845  *      alloc_netdev_mqs - allocate network device
5846  *      @sizeof_priv:   size of private data to allocate space for
5847  *      @name:          device name format string
5848  *      @setup:         callback to initialize device
5849  *      @txqs:          the number of TX subqueues to allocate
5850  *      @rxqs:          the number of RX subqueues to allocate
5851  *
5852  *      Allocates a struct net_device with private data area for driver use
5853  *      and performs basic initialization.  Also allocates subquue structs
5854  *      for each queue on the device.
5855  */
5856 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
5857                 void (*setup)(struct net_device *),
5858                 unsigned int txqs, unsigned int rxqs)
5859 {
5860         struct net_device *dev;
5861         size_t alloc_size;
5862         struct net_device *p;
5863
5864         BUG_ON(strlen(name) >= sizeof(dev->name));
5865
5866         if (txqs < 1) {
5867                 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
5868                 return NULL;
5869         }
5870
5871 #ifdef CONFIG_RPS
5872         if (rxqs < 1) {
5873                 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
5874                 return NULL;
5875         }
5876 #endif
5877
5878         alloc_size = sizeof(struct net_device);
5879         if (sizeof_priv) {
5880                 /* ensure 32-byte alignment of private area */
5881                 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
5882                 alloc_size += sizeof_priv;
5883         }
5884         /* ensure 32-byte alignment of whole construct */
5885         alloc_size += NETDEV_ALIGN - 1;
5886
5887         p = kzalloc(alloc_size, GFP_KERNEL);
5888         if (!p) {
5889                 pr_err("alloc_netdev: Unable to allocate device\n");
5890                 return NULL;
5891         }
5892
5893         dev = PTR_ALIGN(p, NETDEV_ALIGN);
5894         dev->padded = (char *)dev - (char *)p;
5895
5896         dev->pcpu_refcnt = alloc_percpu(int);
5897         if (!dev->pcpu_refcnt)
5898                 goto free_p;
5899
5900         if (dev_addr_init(dev))
5901                 goto free_pcpu;
5902
5903         dev_mc_init(dev);
5904         dev_uc_init(dev);
5905
5906         dev_net_set(dev, &init_net);
5907
5908         dev->gso_max_size = GSO_MAX_SIZE;
5909
5910         INIT_LIST_HEAD(&dev->napi_list);
5911         INIT_LIST_HEAD(&dev->unreg_list);
5912         INIT_LIST_HEAD(&dev->link_watch_list);
5913         dev->priv_flags = IFF_XMIT_DST_RELEASE;
5914         setup(dev);
5915
5916         dev->num_tx_queues = txqs;
5917         dev->real_num_tx_queues = txqs;
5918         if (netif_alloc_netdev_queues(dev))
5919                 goto free_all;
5920
5921 #ifdef CONFIG_RPS
5922         dev->num_rx_queues = rxqs;
5923         dev->real_num_rx_queues = rxqs;
5924         if (netif_alloc_rx_queues(dev))
5925                 goto free_all;
5926 #endif
5927
5928         strcpy(dev->name, name);
5929         dev->group = INIT_NETDEV_GROUP;
5930         return dev;
5931
5932 free_all:
5933         free_netdev(dev);
5934         return NULL;
5935
5936 free_pcpu:
5937         free_percpu(dev->pcpu_refcnt);
5938         kfree(dev->_tx);
5939 #ifdef CONFIG_RPS
5940         kfree(dev->_rx);
5941 #endif
5942
5943 free_p:
5944         kfree(p);
5945         return NULL;
5946 }
5947 EXPORT_SYMBOL(alloc_netdev_mqs);
5948
5949 /**
5950  *      free_netdev - free network device
5951  *      @dev: device
5952  *
5953  *      This function does the last stage of destroying an allocated device
5954  *      interface. The reference to the device object is released.
5955  *      If this is the last reference then it will be freed.
5956  */
5957 void free_netdev(struct net_device *dev)
5958 {
5959         struct napi_struct *p, *n;
5960
5961         release_net(dev_net(dev));
5962
5963         kfree(dev->_tx);
5964 #ifdef CONFIG_RPS
5965         kfree(dev->_rx);
5966 #endif
5967
5968         kfree(rcu_dereference_protected(dev->ingress_queue, 1));
5969
5970         /* Flush device addresses */
5971         dev_addr_flush(dev);
5972
5973         list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5974                 netif_napi_del(p);
5975
5976         free_percpu(dev->pcpu_refcnt);
5977         dev->pcpu_refcnt = NULL;
5978
5979         /*  Compatibility with error handling in drivers */
5980         if (dev->reg_state == NETREG_UNINITIALIZED) {
5981                 kfree((char *)dev - dev->padded);
5982                 return;
5983         }
5984
5985         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
5986         dev->reg_state = NETREG_RELEASED;
5987
5988         /* will free via device release */
5989         put_device(&dev->dev);
5990 }
5991 EXPORT_SYMBOL(free_netdev);
5992
5993 /**
5994  *      synchronize_net -  Synchronize with packet receive processing
5995  *
5996  *      Wait for packets currently being received to be done.
5997  *      Does not block later packets from starting.
5998  */
5999 void synchronize_net(void)
6000 {
6001         might_sleep();
6002         if (rtnl_is_locked())
6003                 synchronize_rcu_expedited();
6004         else
6005                 synchronize_rcu();
6006 }
6007 EXPORT_SYMBOL(synchronize_net);
6008
6009 /**
6010  *      unregister_netdevice_queue - remove device from the kernel
6011  *      @dev: device
6012  *      @head: list
6013  *
6014  *      This function shuts down a device interface and removes it
6015  *      from the kernel tables.
6016  *      If head not NULL, device is queued to be unregistered later.
6017  *
6018  *      Callers must hold the rtnl semaphore.  You may want
6019  *      unregister_netdev() instead of this.
6020  */
6021
6022 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
6023 {
6024         ASSERT_RTNL();
6025
6026         if (head) {
6027                 list_move_tail(&dev->unreg_list, head);
6028         } else {
6029                 rollback_registered(dev);
6030                 /* Finish processing unregister after unlock */
6031                 net_set_todo(dev);
6032         }
6033 }
6034 EXPORT_SYMBOL(unregister_netdevice_queue);
6035
6036 /**
6037  *      unregister_netdevice_many - unregister many devices
6038  *      @head: list of devices
6039  */
6040 void unregister_netdevice_many(struct list_head *head)
6041 {
6042         struct net_device *dev;
6043
6044         if (!list_empty(head)) {
6045                 rollback_registered_many(head);
6046                 list_for_each_entry(dev, head, unreg_list)
6047                         net_set_todo(dev);
6048         }
6049 }
6050 EXPORT_SYMBOL(unregister_netdevice_many);
6051
6052 /**
6053  *      unregister_netdev - remove device from the kernel
6054  *      @dev: device
6055  *
6056  *      This function shuts down a device interface and removes it
6057  *      from the kernel tables.
6058  *
6059  *      This is just a wrapper for unregister_netdevice that takes
6060  *      the rtnl semaphore.  In general you want to use this and not
6061  *      unregister_netdevice.
6062  */
6063 void unregister_netdev(struct net_device *dev)
6064 {
6065         rtnl_lock();
6066         unregister_netdevice(dev);
6067         rtnl_unlock();
6068 }
6069 EXPORT_SYMBOL(unregister_netdev);
6070
6071 /**
6072  *      dev_change_net_namespace - move device to different nethost namespace
6073  *      @dev: device
6074  *      @net: network namespace
6075  *      @pat: If not NULL name pattern to try if the current device name
6076  *            is already taken in the destination network namespace.
6077  *
6078  *      This function shuts down a device interface and moves it
6079  *      to a new network namespace. On success 0 is returned, on
6080  *      a failure a netagive errno code is returned.
6081  *
6082  *      Callers must hold the rtnl semaphore.
6083  */
6084
6085 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6086 {
6087         int err;
6088
6089         ASSERT_RTNL();
6090
6091         /* Don't allow namespace local devices to be moved. */
6092         err = -EINVAL;
6093         if (dev->features & NETIF_F_NETNS_LOCAL)
6094                 goto out;
6095
6096         /* Ensure the device has been registrered */
6097         err = -EINVAL;
6098         if (dev->reg_state != NETREG_REGISTERED)
6099                 goto out;
6100
6101         /* Get out if there is nothing todo */
6102         err = 0;
6103         if (net_eq(dev_net(dev), net))
6104                 goto out;
6105
6106         /* Pick the destination device name, and ensure
6107          * we can use it in the destination network namespace.
6108          */
6109         err = -EEXIST;
6110         if (__dev_get_by_name(net, dev->name)) {
6111                 /* We get here if we can't use the current device name */
6112                 if (!pat)
6113                         goto out;
6114                 if (dev_get_valid_name(dev, pat) < 0)
6115                         goto out;
6116         }
6117
6118         /*
6119          * And now a mini version of register_netdevice unregister_netdevice.
6120          */
6121
6122         /* If device is running close it first. */
6123         dev_close(dev);
6124
6125         /* And unlink it from device chain */
6126         err = -ENODEV;
6127         unlist_netdevice(dev);
6128
6129         synchronize_net();
6130
6131         /* Shutdown queueing discipline. */
6132         dev_shutdown(dev);
6133
6134         /* Notify protocols, that we are about to destroy
6135            this device. They should clean all the things.
6136
6137            Note that dev->reg_state stays at NETREG_REGISTERED.
6138            This is wanted because this way 8021q and macvlan know
6139            the device is just moving and can keep their slaves up.
6140         */
6141         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6142         call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
6143         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
6144
6145         /*
6146          *      Flush the unicast and multicast chains
6147          */
6148         dev_uc_flush(dev);
6149         dev_mc_flush(dev);
6150
6151         /* Actually switch the network namespace */
6152         dev_net_set(dev, net);
6153
6154         /* If there is an ifindex conflict assign a new one */
6155         if (__dev_get_by_index(net, dev->ifindex)) {
6156                 int iflink = (dev->iflink == dev->ifindex);
6157                 dev->ifindex = dev_new_index(net);
6158                 if (iflink)
6159                         dev->iflink = dev->ifindex;
6160         }
6161
6162         /* Fixup kobjects */
6163         err = device_rename(&dev->dev, dev->name);
6164         WARN_ON(err);
6165
6166         /* Add the device back in the hashes */
6167         list_netdevice(dev);
6168
6169         /* Notify protocols, that a new device appeared. */
6170         call_netdevice_notifiers(NETDEV_REGISTER, dev);
6171
6172         /*
6173          *      Prevent userspace races by waiting until the network
6174          *      device is fully setup before sending notifications.
6175          */
6176         rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
6177
6178         synchronize_net();
6179         err = 0;
6180 out:
6181         return err;
6182 }
6183 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
6184
6185 static int dev_cpu_callback(struct notifier_block *nfb,
6186                             unsigned long action,
6187                             void *ocpu)
6188 {
6189         struct sk_buff **list_skb;
6190         struct sk_buff *skb;
6191         unsigned int cpu, oldcpu = (unsigned long)ocpu;
6192         struct softnet_data *sd, *oldsd;
6193
6194         if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
6195                 return NOTIFY_OK;
6196
6197         local_irq_disable();
6198         cpu = smp_processor_id();
6199         sd = &per_cpu(softnet_data, cpu);
6200         oldsd = &per_cpu(softnet_data, oldcpu);
6201
6202         /* Find end of our completion_queue. */
6203         list_skb = &sd->completion_queue;
6204         while (*list_skb)
6205                 list_skb = &(*list_skb)->next;
6206         /* Append completion queue from offline CPU. */
6207         *list_skb = oldsd->completion_queue;
6208         oldsd->completion_queue = NULL;
6209
6210         /* Append output queue from offline CPU. */
6211         if (oldsd->output_queue) {
6212                 *sd->output_queue_tailp = oldsd->output_queue;
6213                 sd->output_queue_tailp = oldsd->output_queue_tailp;
6214                 oldsd->output_queue = NULL;
6215                 oldsd->output_queue_tailp = &oldsd->output_queue;
6216         }
6217         /* Append NAPI poll list from offline CPU. */
6218         if (!list_empty(&oldsd->poll_list)) {
6219                 list_splice_init(&oldsd->poll_list, &sd->poll_list);
6220                 raise_softirq_irqoff(NET_RX_SOFTIRQ);
6221         }
6222
6223         raise_softirq_irqoff(NET_TX_SOFTIRQ);
6224         local_irq_enable();
6225
6226         /* Process offline CPU's input_pkt_queue */
6227         while ((skb = __skb_dequeue(&oldsd->process_queue))) {
6228                 netif_rx(skb);
6229                 input_queue_head_incr(oldsd);
6230         }
6231         while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
6232                 netif_rx(skb);
6233                 input_queue_head_incr(oldsd);
6234         }
6235
6236         return NOTIFY_OK;
6237 }
6238
6239
6240 /**
6241  *      netdev_increment_features - increment feature set by one
6242  *      @all: current feature set
6243  *      @one: new feature set
6244  *      @mask: mask feature set
6245  *
6246  *      Computes a new feature set after adding a device with feature set
6247  *      @one to the master device with current feature set @all.  Will not
6248  *      enable anything that is off in @mask. Returns the new feature set.
6249  */
6250 netdev_features_t netdev_increment_features(netdev_features_t all,
6251         netdev_features_t one, netdev_features_t mask)
6252 {
6253         if (mask & NETIF_F_GEN_CSUM)
6254                 mask |= NETIF_F_ALL_CSUM;
6255         mask |= NETIF_F_VLAN_CHALLENGED;
6256
6257         all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
6258         all &= one | ~NETIF_F_ALL_FOR_ALL;
6259
6260         /* If one device supports hw checksumming, set for all. */
6261         if (all & NETIF_F_GEN_CSUM)
6262                 all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
6263
6264         return all;
6265 }
6266 EXPORT_SYMBOL(netdev_increment_features);
6267
6268 static struct hlist_head *netdev_create_hash(void)
6269 {
6270         int i;
6271         struct hlist_head *hash;
6272
6273         hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6274         if (hash != NULL)
6275                 for (i = 0; i < NETDEV_HASHENTRIES; i++)
6276                         INIT_HLIST_HEAD(&hash[i]);
6277
6278         return hash;
6279 }
6280
6281 /* Initialize per network namespace state */
6282 static int __net_init netdev_init(struct net *net)
6283 {
6284         INIT_LIST_HEAD(&net->dev_base_head);
6285
6286         net->dev_name_head = netdev_create_hash();
6287         if (net->dev_name_head == NULL)
6288                 goto err_name;
6289
6290         net->dev_index_head = netdev_create_hash();
6291         if (net->dev_index_head == NULL)
6292                 goto err_idx;
6293
6294         return 0;
6295
6296 err_idx:
6297         kfree(net->dev_name_head);
6298 err_name:
6299         return -ENOMEM;
6300 }
6301
6302 /**
6303  *      netdev_drivername - network driver for the device
6304  *      @dev: network device
6305  *
6306  *      Determine network driver for device.
6307  */
6308 const char *netdev_drivername(const struct net_device *dev)
6309 {
6310         const struct device_driver *driver;
6311         const struct device *parent;
6312         const char *empty = "";
6313
6314         parent = dev->dev.parent;
6315         if (!parent)
6316                 return empty;
6317
6318         driver = parent->driver;
6319         if (driver && driver->name)
6320                 return driver->name;
6321         return empty;
6322 }
6323
6324 int __netdev_printk(const char *level, const struct net_device *dev,
6325                            struct va_format *vaf)
6326 {
6327         int r;
6328
6329         if (dev && dev->dev.parent)
6330                 r = dev_printk(level, dev->dev.parent, "%s: %pV",
6331                                netdev_name(dev), vaf);
6332         else if (dev)
6333                 r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6334         else
6335                 r = printk("%s(NULL net_device): %pV", level, vaf);
6336
6337         return r;
6338 }
6339 EXPORT_SYMBOL(__netdev_printk);
6340
6341 int netdev_printk(const char *level, const struct net_device *dev,
6342                   const char *format, ...)
6343 {
6344         struct va_format vaf;
6345         va_list args;
6346         int r;
6347
6348         va_start(args, format);
6349
6350         vaf.fmt = format;
6351         vaf.va = &args;
6352
6353         r = __netdev_printk(level, dev, &vaf);
6354         va_end(args);
6355
6356         return r;
6357 }
6358 EXPORT_SYMBOL(netdev_printk);
6359
6360 #define define_netdev_printk_level(func, level)                 \
6361 int func(const struct net_device *dev, const char *fmt, ...)    \
6362 {                                                               \
6363         int r;                                                  \
6364         struct va_format vaf;                                   \
6365         va_list args;                                           \
6366                                                                 \
6367         va_start(args, fmt);                                    \
6368                                                                 \
6369         vaf.fmt = fmt;                                          \
6370         vaf.va = &args;                                         \
6371                                                                 \
6372         r = __netdev_printk(level, dev, &vaf);                  \
6373         va_end(args);                                           \
6374                                                                 \
6375         return r;                                               \
6376 }                                                               \
6377 EXPORT_SYMBOL(func);
6378
6379 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6380 define_netdev_printk_level(netdev_alert, KERN_ALERT);
6381 define_netdev_printk_level(netdev_crit, KERN_CRIT);
6382 define_netdev_printk_level(netdev_err, KERN_ERR);
6383 define_netdev_printk_level(netdev_warn, KERN_WARNING);
6384 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6385 define_netdev_printk_level(netdev_info, KERN_INFO);
6386
6387 static void __net_exit netdev_exit(struct net *net)
6388 {
6389         kfree(net->dev_name_head);
6390         kfree(net->dev_index_head);
6391 }
6392
6393 static struct pernet_operations __net_initdata netdev_net_ops = {
6394         .init = netdev_init,
6395         .exit = netdev_exit,
6396 };
6397
6398 static void __net_exit default_device_exit(struct net *net)
6399 {
6400         struct net_device *dev, *aux;
6401         /*
6402          * Push all migratable network devices back to the
6403          * initial network namespace
6404          */
6405         rtnl_lock();
6406         for_each_netdev_safe(net, dev, aux) {
6407                 int err;
6408                 char fb_name[IFNAMSIZ];
6409
6410                 /* Ignore unmoveable devices (i.e. loopback) */
6411                 if (dev->features & NETIF_F_NETNS_LOCAL)
6412                         continue;
6413
6414                 /* Leave virtual devices for the generic cleanup */
6415                 if (dev->rtnl_link_ops)
6416                         continue;
6417
6418                 /* Push remaining network devices to init_net */
6419                 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6420                 err = dev_change_net_namespace(dev, &init_net, fb_name);
6421                 if (err) {
6422                         pr_emerg("%s: failed to move %s to init_net: %d\n",
6423                                  __func__, dev->name, err);
6424                         BUG();
6425                 }
6426         }
6427         rtnl_unlock();
6428 }
6429
6430 static void __net_exit default_device_exit_batch(struct list_head *net_list)
6431 {
6432         /* At exit all network devices most be removed from a network
6433          * namespace.  Do this in the reverse order of registration.
6434          * Do this across as many network namespaces as possible to
6435          * improve batching efficiency.
6436          */
6437         struct net_device *dev;
6438         struct net *net;
6439         LIST_HEAD(dev_kill_list);
6440
6441         rtnl_lock();
6442         list_for_each_entry(net, net_list, exit_list) {
6443                 for_each_netdev_reverse(net, dev) {
6444                         if (dev->rtnl_link_ops)
6445                                 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6446                         else
6447                                 unregister_netdevice_queue(dev, &dev_kill_list);
6448                 }
6449         }
6450         unregister_netdevice_many(&dev_kill_list);
6451         list_del(&dev_kill_list);
6452         rtnl_unlock();
6453 }
6454
6455 static struct pernet_operations __net_initdata default_device_ops = {
6456         .exit = default_device_exit,
6457         .exit_batch = default_device_exit_batch,
6458 };
6459
6460 /*
6461  *      Initialize the DEV module. At boot time this walks the device list and
6462  *      unhooks any devices that fail to initialise (normally hardware not
6463  *      present) and leaves us with a valid list of present and active devices.
6464  *
6465  */
6466
6467 /*
6468  *       This is called single threaded during boot, so no need
6469  *       to take the rtnl semaphore.
6470  */
6471 static int __init net_dev_init(void)
6472 {
6473         int i, rc = -ENOMEM;
6474
6475         BUG_ON(!dev_boot_phase);
6476
6477         if (dev_proc_init())
6478                 goto out;
6479
6480         if (netdev_kobject_init())
6481                 goto out;
6482
6483         INIT_LIST_HEAD(&ptype_all);
6484         for (i = 0; i < PTYPE_HASH_SIZE; i++)
6485                 INIT_LIST_HEAD(&ptype_base[i]);
6486
6487         if (register_pernet_subsys(&netdev_net_ops))
6488                 goto out;
6489
6490         /*
6491          *      Initialise the packet receive queues.
6492          */
6493
6494         for_each_possible_cpu(i) {
6495                 struct softnet_data *sd = &per_cpu(softnet_data, i);
6496
6497                 memset(sd, 0, sizeof(*sd));
6498                 skb_queue_head_init(&sd->input_pkt_queue);
6499                 skb_queue_head_init(&sd->process_queue);
6500                 sd->completion_queue = NULL;
6501                 INIT_LIST_HEAD(&sd->poll_list);
6502                 sd->output_queue = NULL;
6503                 sd->output_queue_tailp = &sd->output_queue;
6504 #ifdef CONFIG_RPS
6505                 sd->csd.func = rps_trigger_softirq;
6506                 sd->csd.info = sd;
6507                 sd->csd.flags = 0;
6508                 sd->cpu = i;
6509 #endif
6510
6511                 sd->backlog.poll = process_backlog;
6512                 sd->backlog.weight = weight_p;
6513                 sd->backlog.gro_list = NULL;
6514                 sd->backlog.gro_count = 0;
6515         }
6516
6517         dev_boot_phase = 0;
6518
6519         /* The loopback device is special if any other network devices
6520          * is present in a network namespace the loopback device must
6521          * be present. Since we now dynamically allocate and free the
6522          * loopback device ensure this invariant is maintained by
6523          * keeping the loopback device as the first device on the
6524          * list of network devices.  Ensuring the loopback devices
6525          * is the first device that appears and the last network device
6526          * that disappears.
6527          */
6528         if (register_pernet_device(&loopback_net_ops))
6529                 goto out;
6530
6531         if (register_pernet_device(&default_device_ops))
6532                 goto out;
6533
6534         open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6535         open_softirq(NET_RX_SOFTIRQ, net_rx_action);
6536
6537         hotcpu_notifier(dev_cpu_callback, 0);
6538         dst_init();
6539         dev_mcast_init();
6540         rc = 0;
6541 out:
6542         return rc;
6543 }
6544
6545 subsys_initcall(net_dev_init);
6546
6547 static int __init initialize_hashrnd(void)
6548 {
6549         get_random_bytes(&hashrnd, sizeof(hashrnd));
6550         return 0;
6551 }
6552
6553 late_initcall_sync(initialize_hashrnd);
6554