]> git.kernelconcepts.de Git - karo-tx-linux.git/blob - net/core/dev.c
net: remove ndo_neigh_{construct, destroy} from stacked devices
[karo-tx-linux.git] / net / core / dev.c
1 /*
2  *      NET3    Protocol independent device support routines.
3  *
4  *              This program is free software; you can redistribute it and/or
5  *              modify it under the terms of the GNU General Public License
6  *              as published by the Free Software Foundation; either version
7  *              2 of the License, or (at your option) any later version.
8  *
9  *      Derived from the non IP parts of dev.c 1.0.19
10  *              Authors:        Ross Biro
11  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
13  *
14  *      Additional Authors:
15  *              Florian la Roche <rzsfl@rz.uni-sb.de>
16  *              Alan Cox <gw4pts@gw4pts.ampr.org>
17  *              David Hinds <dahinds@users.sourceforge.net>
18  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19  *              Adam Sulmicki <adam@cfar.umd.edu>
20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
21  *
22  *      Changes:
23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
24  *                                      to 2 if register_netdev gets called
25  *                                      before net_dev_init & also removed a
26  *                                      few lines of code in the process.
27  *              Alan Cox        :       device private ioctl copies fields back.
28  *              Alan Cox        :       Transmit queue code does relevant
29  *                                      stunts to keep the queue safe.
30  *              Alan Cox        :       Fixed double lock.
31  *              Alan Cox        :       Fixed promisc NULL pointer trap
32  *              ????????        :       Support the full private ioctl range
33  *              Alan Cox        :       Moved ioctl permission check into
34  *                                      drivers
35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
36  *              Alan Cox        :       100 backlog just doesn't cut it when
37  *                                      you start doing multicast video 8)
38  *              Alan Cox        :       Rewrote net_bh and list manager.
39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
40  *              Alan Cox        :       Took out transmit every packet pass
41  *                                      Saved a few bytes in the ioctl handler
42  *              Alan Cox        :       Network driver sets packet type before
43  *                                      calling netif_rx. Saves a function
44  *                                      call a packet.
45  *              Alan Cox        :       Hashed net_bh()
46  *              Richard Kooijman:       Timestamp fixes.
47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
48  *              Alan Cox        :       Device lock protection.
49  *              Alan Cox        :       Fixed nasty side effect of device close
50  *                                      changes.
51  *              Rudi Cilibrasi  :       Pass the right thing to
52  *                                      set_mac_address()
53  *              Dave Miller     :       32bit quantity for the device lock to
54  *                                      make it work out on a Sparc.
55  *              Bjorn Ekwall    :       Added KERNELD hack.
56  *              Alan Cox        :       Cleaned up the backlog initialise.
57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
58  *                                      1 device.
59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
60  *                                      is no device open function.
61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
63  *              Cyrus Durgin    :       Cleaned for KMOD
64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
65  *                                      A network device unload needs to purge
66  *                                      the backlog queue.
67  *      Paul Rusty Russell      :       SIOCSIFNAME
68  *              Pekka Riikonen  :       Netdev boot-time settings code
69  *              Andrew Morton   :       Make unregister_netdevice wait
70  *                                      indefinitely on dev->refcnt
71  *              J Hadi Salim    :       - Backlog queue sampling
72  *                                      - netif_rx() feedback
73  */
74
75 #include <linux/uaccess.h>
76 #include <linux/bitops.h>
77 #include <linux/capability.h>
78 #include <linux/cpu.h>
79 #include <linux/types.h>
80 #include <linux/kernel.h>
81 #include <linux/hash.h>
82 #include <linux/slab.h>
83 #include <linux/sched.h>
84 #include <linux/mutex.h>
85 #include <linux/string.h>
86 #include <linux/mm.h>
87 #include <linux/socket.h>
88 #include <linux/sockios.h>
89 #include <linux/errno.h>
90 #include <linux/interrupt.h>
91 #include <linux/if_ether.h>
92 #include <linux/netdevice.h>
93 #include <linux/etherdevice.h>
94 #include <linux/ethtool.h>
95 #include <linux/notifier.h>
96 #include <linux/skbuff.h>
97 #include <linux/bpf.h>
98 #include <net/net_namespace.h>
99 #include <net/sock.h>
100 #include <net/busy_poll.h>
101 #include <linux/rtnetlink.h>
102 #include <linux/stat.h>
103 #include <net/dst.h>
104 #include <net/dst_metadata.h>
105 #include <net/pkt_sched.h>
106 #include <net/checksum.h>
107 #include <net/xfrm.h>
108 #include <linux/highmem.h>
109 #include <linux/init.h>
110 #include <linux/module.h>
111 #include <linux/netpoll.h>
112 #include <linux/rcupdate.h>
113 #include <linux/delay.h>
114 #include <net/iw_handler.h>
115 #include <asm/current.h>
116 #include <linux/audit.h>
117 #include <linux/dmaengine.h>
118 #include <linux/err.h>
119 #include <linux/ctype.h>
120 #include <linux/if_arp.h>
121 #include <linux/if_vlan.h>
122 #include <linux/ip.h>
123 #include <net/ip.h>
124 #include <net/mpls.h>
125 #include <linux/ipv6.h>
126 #include <linux/in.h>
127 #include <linux/jhash.h>
128 #include <linux/random.h>
129 #include <trace/events/napi.h>
130 #include <trace/events/net.h>
131 #include <trace/events/skb.h>
132 #include <linux/pci.h>
133 #include <linux/inetdevice.h>
134 #include <linux/cpu_rmap.h>
135 #include <linux/static_key.h>
136 #include <linux/hashtable.h>
137 #include <linux/vmalloc.h>
138 #include <linux/if_macvlan.h>
139 #include <linux/errqueue.h>
140 #include <linux/hrtimer.h>
141 #include <linux/netfilter_ingress.h>
142 #include <linux/crash_dump.h>
143
144 #include "net-sysfs.h"
145
146 /* Instead of increasing this, you should create a hash table. */
147 #define MAX_GRO_SKBS 8
148
149 /* This should be increased if a protocol with a bigger head is added. */
150 #define GRO_MAX_HEAD (MAX_HEADER + 128)
151
152 static DEFINE_SPINLOCK(ptype_lock);
153 static DEFINE_SPINLOCK(offload_lock);
154 struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
155 struct list_head ptype_all __read_mostly;       /* Taps */
156 static struct list_head offload_base __read_mostly;
157
158 static int netif_rx_internal(struct sk_buff *skb);
159 static int call_netdevice_notifiers_info(unsigned long val,
160                                          struct net_device *dev,
161                                          struct netdev_notifier_info *info);
162
163 /*
164  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
165  * semaphore.
166  *
167  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
168  *
169  * Writers must hold the rtnl semaphore while they loop through the
170  * dev_base_head list, and hold dev_base_lock for writing when they do the
171  * actual updates.  This allows pure readers to access the list even
172  * while a writer is preparing to update it.
173  *
174  * To put it another way, dev_base_lock is held for writing only to
175  * protect against pure readers; the rtnl semaphore provides the
176  * protection against other writers.
177  *
178  * See, for example usages, register_netdevice() and
179  * unregister_netdevice(), which must be called with the rtnl
180  * semaphore held.
181  */
182 DEFINE_RWLOCK(dev_base_lock);
183 EXPORT_SYMBOL(dev_base_lock);
184
185 /* protects napi_hash addition/deletion and napi_gen_id */
186 static DEFINE_SPINLOCK(napi_hash_lock);
187
188 static unsigned int napi_gen_id = NR_CPUS;
189 static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
190
191 static seqcount_t devnet_rename_seq;
192
193 static inline void dev_base_seq_inc(struct net *net)
194 {
195         while (++net->dev_base_seq == 0);
196 }
197
198 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
199 {
200         unsigned int hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ));
201
202         return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
203 }
204
205 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
206 {
207         return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
208 }
209
210 static inline void rps_lock(struct softnet_data *sd)
211 {
212 #ifdef CONFIG_RPS
213         spin_lock(&sd->input_pkt_queue.lock);
214 #endif
215 }
216
217 static inline void rps_unlock(struct softnet_data *sd)
218 {
219 #ifdef CONFIG_RPS
220         spin_unlock(&sd->input_pkt_queue.lock);
221 #endif
222 }
223
224 /* Device list insertion */
225 static void list_netdevice(struct net_device *dev)
226 {
227         struct net *net = dev_net(dev);
228
229         ASSERT_RTNL();
230
231         write_lock_bh(&dev_base_lock);
232         list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
233         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
234         hlist_add_head_rcu(&dev->index_hlist,
235                            dev_index_hash(net, dev->ifindex));
236         write_unlock_bh(&dev_base_lock);
237
238         dev_base_seq_inc(net);
239 }
240
241 /* Device list removal
242  * caller must respect a RCU grace period before freeing/reusing dev
243  */
244 static void unlist_netdevice(struct net_device *dev)
245 {
246         ASSERT_RTNL();
247
248         /* Unlink dev from the device chain */
249         write_lock_bh(&dev_base_lock);
250         list_del_rcu(&dev->dev_list);
251         hlist_del_rcu(&dev->name_hlist);
252         hlist_del_rcu(&dev->index_hlist);
253         write_unlock_bh(&dev_base_lock);
254
255         dev_base_seq_inc(dev_net(dev));
256 }
257
258 /*
259  *      Our notifier list
260  */
261
262 static RAW_NOTIFIER_HEAD(netdev_chain);
263
264 /*
265  *      Device drivers call our routines to queue packets here. We empty the
266  *      queue in the local softnet handler.
267  */
268
269 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
270 EXPORT_PER_CPU_SYMBOL(softnet_data);
271
272 #ifdef CONFIG_LOCKDEP
273 /*
274  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
275  * according to dev->type
276  */
277 static const unsigned short netdev_lock_type[] =
278         {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
279          ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
280          ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
281          ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
282          ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
283          ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
284          ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
285          ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
286          ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
287          ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
288          ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
289          ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
290          ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
291          ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
292          ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
293
294 static const char *const netdev_lock_name[] =
295         {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
296          "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
297          "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
298          "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
299          "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
300          "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
301          "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
302          "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
303          "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
304          "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
305          "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
306          "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
307          "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
308          "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
309          "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
310
311 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
312 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
313
314 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
315 {
316         int i;
317
318         for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
319                 if (netdev_lock_type[i] == dev_type)
320                         return i;
321         /* the last key is used by default */
322         return ARRAY_SIZE(netdev_lock_type) - 1;
323 }
324
325 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
326                                                  unsigned short dev_type)
327 {
328         int i;
329
330         i = netdev_lock_pos(dev_type);
331         lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
332                                    netdev_lock_name[i]);
333 }
334
335 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
336 {
337         int i;
338
339         i = netdev_lock_pos(dev->type);
340         lockdep_set_class_and_name(&dev->addr_list_lock,
341                                    &netdev_addr_lock_key[i],
342                                    netdev_lock_name[i]);
343 }
344 #else
345 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
346                                                  unsigned short dev_type)
347 {
348 }
349 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
350 {
351 }
352 #endif
353
354 /*******************************************************************************
355
356                 Protocol management and registration routines
357
358 *******************************************************************************/
359
360 /*
361  *      Add a protocol ID to the list. Now that the input handler is
362  *      smarter we can dispense with all the messy stuff that used to be
363  *      here.
364  *
365  *      BEWARE!!! Protocol handlers, mangling input packets,
366  *      MUST BE last in hash buckets and checking protocol handlers
367  *      MUST start from promiscuous ptype_all chain in net_bh.
368  *      It is true now, do not change it.
369  *      Explanation follows: if protocol handler, mangling packet, will
370  *      be the first on list, it is not able to sense, that packet
371  *      is cloned and should be copied-on-write, so that it will
372  *      change it and subsequent readers will get broken packet.
373  *                                                      --ANK (980803)
374  */
375
376 static inline struct list_head *ptype_head(const struct packet_type *pt)
377 {
378         if (pt->type == htons(ETH_P_ALL))
379                 return pt->dev ? &pt->dev->ptype_all : &ptype_all;
380         else
381                 return pt->dev ? &pt->dev->ptype_specific :
382                                  &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
383 }
384
385 /**
386  *      dev_add_pack - add packet handler
387  *      @pt: packet type declaration
388  *
389  *      Add a protocol handler to the networking stack. The passed &packet_type
390  *      is linked into kernel lists and may not be freed until it has been
391  *      removed from the kernel lists.
392  *
393  *      This call does not sleep therefore it can not
394  *      guarantee all CPU's that are in middle of receiving packets
395  *      will see the new packet type (until the next received packet).
396  */
397
398 void dev_add_pack(struct packet_type *pt)
399 {
400         struct list_head *head = ptype_head(pt);
401
402         spin_lock(&ptype_lock);
403         list_add_rcu(&pt->list, head);
404         spin_unlock(&ptype_lock);
405 }
406 EXPORT_SYMBOL(dev_add_pack);
407
408 /**
409  *      __dev_remove_pack        - remove packet handler
410  *      @pt: packet type declaration
411  *
412  *      Remove a protocol handler that was previously added to the kernel
413  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
414  *      from the kernel lists and can be freed or reused once this function
415  *      returns.
416  *
417  *      The packet type might still be in use by receivers
418  *      and must not be freed until after all the CPU's have gone
419  *      through a quiescent state.
420  */
421 void __dev_remove_pack(struct packet_type *pt)
422 {
423         struct list_head *head = ptype_head(pt);
424         struct packet_type *pt1;
425
426         spin_lock(&ptype_lock);
427
428         list_for_each_entry(pt1, head, list) {
429                 if (pt == pt1) {
430                         list_del_rcu(&pt->list);
431                         goto out;
432                 }
433         }
434
435         pr_warn("dev_remove_pack: %p not found\n", pt);
436 out:
437         spin_unlock(&ptype_lock);
438 }
439 EXPORT_SYMBOL(__dev_remove_pack);
440
441 /**
442  *      dev_remove_pack  - remove packet handler
443  *      @pt: packet type declaration
444  *
445  *      Remove a protocol handler that was previously added to the kernel
446  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
447  *      from the kernel lists and can be freed or reused once this function
448  *      returns.
449  *
450  *      This call sleeps to guarantee that no CPU is looking at the packet
451  *      type after return.
452  */
453 void dev_remove_pack(struct packet_type *pt)
454 {
455         __dev_remove_pack(pt);
456
457         synchronize_net();
458 }
459 EXPORT_SYMBOL(dev_remove_pack);
460
461
462 /**
463  *      dev_add_offload - register offload handlers
464  *      @po: protocol offload declaration
465  *
466  *      Add protocol offload handlers to the networking stack. The passed
467  *      &proto_offload is linked into kernel lists and may not be freed until
468  *      it has been removed from the kernel lists.
469  *
470  *      This call does not sleep therefore it can not
471  *      guarantee all CPU's that are in middle of receiving packets
472  *      will see the new offload handlers (until the next received packet).
473  */
474 void dev_add_offload(struct packet_offload *po)
475 {
476         struct packet_offload *elem;
477
478         spin_lock(&offload_lock);
479         list_for_each_entry(elem, &offload_base, list) {
480                 if (po->priority < elem->priority)
481                         break;
482         }
483         list_add_rcu(&po->list, elem->list.prev);
484         spin_unlock(&offload_lock);
485 }
486 EXPORT_SYMBOL(dev_add_offload);
487
488 /**
489  *      __dev_remove_offload     - remove offload handler
490  *      @po: packet offload declaration
491  *
492  *      Remove a protocol offload handler that was previously added to the
493  *      kernel offload handlers by dev_add_offload(). The passed &offload_type
494  *      is removed from the kernel lists and can be freed or reused once this
495  *      function returns.
496  *
497  *      The packet type might still be in use by receivers
498  *      and must not be freed until after all the CPU's have gone
499  *      through a quiescent state.
500  */
501 static void __dev_remove_offload(struct packet_offload *po)
502 {
503         struct list_head *head = &offload_base;
504         struct packet_offload *po1;
505
506         spin_lock(&offload_lock);
507
508         list_for_each_entry(po1, head, list) {
509                 if (po == po1) {
510                         list_del_rcu(&po->list);
511                         goto out;
512                 }
513         }
514
515         pr_warn("dev_remove_offload: %p not found\n", po);
516 out:
517         spin_unlock(&offload_lock);
518 }
519
520 /**
521  *      dev_remove_offload       - remove packet offload handler
522  *      @po: packet offload declaration
523  *
524  *      Remove a packet offload handler that was previously added to the kernel
525  *      offload handlers by dev_add_offload(). The passed &offload_type is
526  *      removed from the kernel lists and can be freed or reused once this
527  *      function returns.
528  *
529  *      This call sleeps to guarantee that no CPU is looking at the packet
530  *      type after return.
531  */
532 void dev_remove_offload(struct packet_offload *po)
533 {
534         __dev_remove_offload(po);
535
536         synchronize_net();
537 }
538 EXPORT_SYMBOL(dev_remove_offload);
539
540 /******************************************************************************
541
542                       Device Boot-time Settings Routines
543
544 *******************************************************************************/
545
546 /* Boot time configuration table */
547 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
548
549 /**
550  *      netdev_boot_setup_add   - add new setup entry
551  *      @name: name of the device
552  *      @map: configured settings for the device
553  *
554  *      Adds new setup entry to the dev_boot_setup list.  The function
555  *      returns 0 on error and 1 on success.  This is a generic routine to
556  *      all netdevices.
557  */
558 static int netdev_boot_setup_add(char *name, struct ifmap *map)
559 {
560         struct netdev_boot_setup *s;
561         int i;
562
563         s = dev_boot_setup;
564         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
565                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
566                         memset(s[i].name, 0, sizeof(s[i].name));
567                         strlcpy(s[i].name, name, IFNAMSIZ);
568                         memcpy(&s[i].map, map, sizeof(s[i].map));
569                         break;
570                 }
571         }
572
573         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
574 }
575
576 /**
577  *      netdev_boot_setup_check - check boot time settings
578  *      @dev: the netdevice
579  *
580  *      Check boot time settings for the device.
581  *      The found settings are set for the device to be used
582  *      later in the device probing.
583  *      Returns 0 if no settings found, 1 if they are.
584  */
585 int netdev_boot_setup_check(struct net_device *dev)
586 {
587         struct netdev_boot_setup *s = dev_boot_setup;
588         int i;
589
590         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
591                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
592                     !strcmp(dev->name, s[i].name)) {
593                         dev->irq        = s[i].map.irq;
594                         dev->base_addr  = s[i].map.base_addr;
595                         dev->mem_start  = s[i].map.mem_start;
596                         dev->mem_end    = s[i].map.mem_end;
597                         return 1;
598                 }
599         }
600         return 0;
601 }
602 EXPORT_SYMBOL(netdev_boot_setup_check);
603
604
605 /**
606  *      netdev_boot_base        - get address from boot time settings
607  *      @prefix: prefix for network device
608  *      @unit: id for network device
609  *
610  *      Check boot time settings for the base address of device.
611  *      The found settings are set for the device to be used
612  *      later in the device probing.
613  *      Returns 0 if no settings found.
614  */
615 unsigned long netdev_boot_base(const char *prefix, int unit)
616 {
617         const struct netdev_boot_setup *s = dev_boot_setup;
618         char name[IFNAMSIZ];
619         int i;
620
621         sprintf(name, "%s%d", prefix, unit);
622
623         /*
624          * If device already registered then return base of 1
625          * to indicate not to probe for this interface
626          */
627         if (__dev_get_by_name(&init_net, name))
628                 return 1;
629
630         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
631                 if (!strcmp(name, s[i].name))
632                         return s[i].map.base_addr;
633         return 0;
634 }
635
636 /*
637  * Saves at boot time configured settings for any netdevice.
638  */
639 int __init netdev_boot_setup(char *str)
640 {
641         int ints[5];
642         struct ifmap map;
643
644         str = get_options(str, ARRAY_SIZE(ints), ints);
645         if (!str || !*str)
646                 return 0;
647
648         /* Save settings */
649         memset(&map, 0, sizeof(map));
650         if (ints[0] > 0)
651                 map.irq = ints[1];
652         if (ints[0] > 1)
653                 map.base_addr = ints[2];
654         if (ints[0] > 2)
655                 map.mem_start = ints[3];
656         if (ints[0] > 3)
657                 map.mem_end = ints[4];
658
659         /* Add new entry to the list */
660         return netdev_boot_setup_add(str, &map);
661 }
662
663 __setup("netdev=", netdev_boot_setup);
664
665 /*******************************************************************************
666
667                             Device Interface Subroutines
668
669 *******************************************************************************/
670
671 /**
672  *      dev_get_iflink  - get 'iflink' value of a interface
673  *      @dev: targeted interface
674  *
675  *      Indicates the ifindex the interface is linked to.
676  *      Physical interfaces have the same 'ifindex' and 'iflink' values.
677  */
678
679 int dev_get_iflink(const struct net_device *dev)
680 {
681         if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
682                 return dev->netdev_ops->ndo_get_iflink(dev);
683
684         return dev->ifindex;
685 }
686 EXPORT_SYMBOL(dev_get_iflink);
687
688 /**
689  *      dev_fill_metadata_dst - Retrieve tunnel egress information.
690  *      @dev: targeted interface
691  *      @skb: The packet.
692  *
693  *      For better visibility of tunnel traffic OVS needs to retrieve
694  *      egress tunnel information for a packet. Following API allows
695  *      user to get this info.
696  */
697 int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
698 {
699         struct ip_tunnel_info *info;
700
701         if (!dev->netdev_ops  || !dev->netdev_ops->ndo_fill_metadata_dst)
702                 return -EINVAL;
703
704         info = skb_tunnel_info_unclone(skb);
705         if (!info)
706                 return -ENOMEM;
707         if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
708                 return -EINVAL;
709
710         return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
711 }
712 EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
713
714 /**
715  *      __dev_get_by_name       - find a device by its name
716  *      @net: the applicable net namespace
717  *      @name: name to find
718  *
719  *      Find an interface by name. Must be called under RTNL semaphore
720  *      or @dev_base_lock. If the name is found a pointer to the device
721  *      is returned. If the name is not found then %NULL is returned. The
722  *      reference counters are not incremented so the caller must be
723  *      careful with locks.
724  */
725
726 struct net_device *__dev_get_by_name(struct net *net, const char *name)
727 {
728         struct net_device *dev;
729         struct hlist_head *head = dev_name_hash(net, name);
730
731         hlist_for_each_entry(dev, head, name_hlist)
732                 if (!strncmp(dev->name, name, IFNAMSIZ))
733                         return dev;
734
735         return NULL;
736 }
737 EXPORT_SYMBOL(__dev_get_by_name);
738
739 /**
740  *      dev_get_by_name_rcu     - find a device by its name
741  *      @net: the applicable net namespace
742  *      @name: name to find
743  *
744  *      Find an interface by name.
745  *      If the name is found a pointer to the device is returned.
746  *      If the name is not found then %NULL is returned.
747  *      The reference counters are not incremented so the caller must be
748  *      careful with locks. The caller must hold RCU lock.
749  */
750
751 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
752 {
753         struct net_device *dev;
754         struct hlist_head *head = dev_name_hash(net, name);
755
756         hlist_for_each_entry_rcu(dev, head, name_hlist)
757                 if (!strncmp(dev->name, name, IFNAMSIZ))
758                         return dev;
759
760         return NULL;
761 }
762 EXPORT_SYMBOL(dev_get_by_name_rcu);
763
764 /**
765  *      dev_get_by_name         - find a device by its name
766  *      @net: the applicable net namespace
767  *      @name: name to find
768  *
769  *      Find an interface by name. This can be called from any
770  *      context and does its own locking. The returned handle has
771  *      the usage count incremented and the caller must use dev_put() to
772  *      release it when it is no longer needed. %NULL is returned if no
773  *      matching device is found.
774  */
775
776 struct net_device *dev_get_by_name(struct net *net, const char *name)
777 {
778         struct net_device *dev;
779
780         rcu_read_lock();
781         dev = dev_get_by_name_rcu(net, name);
782         if (dev)
783                 dev_hold(dev);
784         rcu_read_unlock();
785         return dev;
786 }
787 EXPORT_SYMBOL(dev_get_by_name);
788
789 /**
790  *      __dev_get_by_index - find a device by its ifindex
791  *      @net: the applicable net namespace
792  *      @ifindex: index of device
793  *
794  *      Search for an interface by index. Returns %NULL if the device
795  *      is not found or a pointer to the device. The device has not
796  *      had its reference counter increased so the caller must be careful
797  *      about locking. The caller must hold either the RTNL semaphore
798  *      or @dev_base_lock.
799  */
800
801 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
802 {
803         struct net_device *dev;
804         struct hlist_head *head = dev_index_hash(net, ifindex);
805
806         hlist_for_each_entry(dev, head, index_hlist)
807                 if (dev->ifindex == ifindex)
808                         return dev;
809
810         return NULL;
811 }
812 EXPORT_SYMBOL(__dev_get_by_index);
813
814 /**
815  *      dev_get_by_index_rcu - find a device by its ifindex
816  *      @net: the applicable net namespace
817  *      @ifindex: index of device
818  *
819  *      Search for an interface by index. Returns %NULL if the device
820  *      is not found or a pointer to the device. The device has not
821  *      had its reference counter increased so the caller must be careful
822  *      about locking. The caller must hold RCU lock.
823  */
824
825 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
826 {
827         struct net_device *dev;
828         struct hlist_head *head = dev_index_hash(net, ifindex);
829
830         hlist_for_each_entry_rcu(dev, head, index_hlist)
831                 if (dev->ifindex == ifindex)
832                         return dev;
833
834         return NULL;
835 }
836 EXPORT_SYMBOL(dev_get_by_index_rcu);
837
838
839 /**
840  *      dev_get_by_index - find a device by its ifindex
841  *      @net: the applicable net namespace
842  *      @ifindex: index of device
843  *
844  *      Search for an interface by index. Returns NULL if the device
845  *      is not found or a pointer to the device. The device returned has
846  *      had a reference added and the pointer is safe until the user calls
847  *      dev_put to indicate they have finished with it.
848  */
849
850 struct net_device *dev_get_by_index(struct net *net, int ifindex)
851 {
852         struct net_device *dev;
853
854         rcu_read_lock();
855         dev = dev_get_by_index_rcu(net, ifindex);
856         if (dev)
857                 dev_hold(dev);
858         rcu_read_unlock();
859         return dev;
860 }
861 EXPORT_SYMBOL(dev_get_by_index);
862
863 /**
864  *      netdev_get_name - get a netdevice name, knowing its ifindex.
865  *      @net: network namespace
866  *      @name: a pointer to the buffer where the name will be stored.
867  *      @ifindex: the ifindex of the interface to get the name from.
868  *
869  *      The use of raw_seqcount_begin() and cond_resched() before
870  *      retrying is required as we want to give the writers a chance
871  *      to complete when CONFIG_PREEMPT is not set.
872  */
873 int netdev_get_name(struct net *net, char *name, int ifindex)
874 {
875         struct net_device *dev;
876         unsigned int seq;
877
878 retry:
879         seq = raw_seqcount_begin(&devnet_rename_seq);
880         rcu_read_lock();
881         dev = dev_get_by_index_rcu(net, ifindex);
882         if (!dev) {
883                 rcu_read_unlock();
884                 return -ENODEV;
885         }
886
887         strcpy(name, dev->name);
888         rcu_read_unlock();
889         if (read_seqcount_retry(&devnet_rename_seq, seq)) {
890                 cond_resched();
891                 goto retry;
892         }
893
894         return 0;
895 }
896
897 /**
898  *      dev_getbyhwaddr_rcu - find a device by its hardware address
899  *      @net: the applicable net namespace
900  *      @type: media type of device
901  *      @ha: hardware address
902  *
903  *      Search for an interface by MAC address. Returns NULL if the device
904  *      is not found or a pointer to the device.
905  *      The caller must hold RCU or RTNL.
906  *      The returned device has not had its ref count increased
907  *      and the caller must therefore be careful about locking
908  *
909  */
910
911 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
912                                        const char *ha)
913 {
914         struct net_device *dev;
915
916         for_each_netdev_rcu(net, dev)
917                 if (dev->type == type &&
918                     !memcmp(dev->dev_addr, ha, dev->addr_len))
919                         return dev;
920
921         return NULL;
922 }
923 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
924
925 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
926 {
927         struct net_device *dev;
928
929         ASSERT_RTNL();
930         for_each_netdev(net, dev)
931                 if (dev->type == type)
932                         return dev;
933
934         return NULL;
935 }
936 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
937
938 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
939 {
940         struct net_device *dev, *ret = NULL;
941
942         rcu_read_lock();
943         for_each_netdev_rcu(net, dev)
944                 if (dev->type == type) {
945                         dev_hold(dev);
946                         ret = dev;
947                         break;
948                 }
949         rcu_read_unlock();
950         return ret;
951 }
952 EXPORT_SYMBOL(dev_getfirstbyhwtype);
953
954 /**
955  *      __dev_get_by_flags - find any device with given flags
956  *      @net: the applicable net namespace
957  *      @if_flags: IFF_* values
958  *      @mask: bitmask of bits in if_flags to check
959  *
960  *      Search for any interface with the given flags. Returns NULL if a device
961  *      is not found or a pointer to the device. Must be called inside
962  *      rtnl_lock(), and result refcount is unchanged.
963  */
964
965 struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
966                                       unsigned short mask)
967 {
968         struct net_device *dev, *ret;
969
970         ASSERT_RTNL();
971
972         ret = NULL;
973         for_each_netdev(net, dev) {
974                 if (((dev->flags ^ if_flags) & mask) == 0) {
975                         ret = dev;
976                         break;
977                 }
978         }
979         return ret;
980 }
981 EXPORT_SYMBOL(__dev_get_by_flags);
982
983 /**
984  *      dev_valid_name - check if name is okay for network device
985  *      @name: name string
986  *
987  *      Network device names need to be valid file names to
988  *      to allow sysfs to work.  We also disallow any kind of
989  *      whitespace.
990  */
991 bool dev_valid_name(const char *name)
992 {
993         if (*name == '\0')
994                 return false;
995         if (strlen(name) >= IFNAMSIZ)
996                 return false;
997         if (!strcmp(name, ".") || !strcmp(name, ".."))
998                 return false;
999
1000         while (*name) {
1001                 if (*name == '/' || *name == ':' || isspace(*name))
1002                         return false;
1003                 name++;
1004         }
1005         return true;
1006 }
1007 EXPORT_SYMBOL(dev_valid_name);
1008
1009 /**
1010  *      __dev_alloc_name - allocate a name for a device
1011  *      @net: network namespace to allocate the device name in
1012  *      @name: name format string
1013  *      @buf:  scratch buffer and result name string
1014  *
1015  *      Passed a format string - eg "lt%d" it will try and find a suitable
1016  *      id. It scans list of devices to build up a free map, then chooses
1017  *      the first empty slot. The caller must hold the dev_base or rtnl lock
1018  *      while allocating the name and adding the device in order to avoid
1019  *      duplicates.
1020  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1021  *      Returns the number of the unit assigned or a negative errno code.
1022  */
1023
1024 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
1025 {
1026         int i = 0;
1027         const char *p;
1028         const int max_netdevices = 8*PAGE_SIZE;
1029         unsigned long *inuse;
1030         struct net_device *d;
1031
1032         p = strnchr(name, IFNAMSIZ-1, '%');
1033         if (p) {
1034                 /*
1035                  * Verify the string as this thing may have come from
1036                  * the user.  There must be either one "%d" and no other "%"
1037                  * characters.
1038                  */
1039                 if (p[1] != 'd' || strchr(p + 2, '%'))
1040                         return -EINVAL;
1041
1042                 /* Use one page as a bit array of possible slots */
1043                 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
1044                 if (!inuse)
1045                         return -ENOMEM;
1046
1047                 for_each_netdev(net, d) {
1048                         if (!sscanf(d->name, name, &i))
1049                                 continue;
1050                         if (i < 0 || i >= max_netdevices)
1051                                 continue;
1052
1053                         /*  avoid cases where sscanf is not exact inverse of printf */
1054                         snprintf(buf, IFNAMSIZ, name, i);
1055                         if (!strncmp(buf, d->name, IFNAMSIZ))
1056                                 set_bit(i, inuse);
1057                 }
1058
1059                 i = find_first_zero_bit(inuse, max_netdevices);
1060                 free_page((unsigned long) inuse);
1061         }
1062
1063         if (buf != name)
1064                 snprintf(buf, IFNAMSIZ, name, i);
1065         if (!__dev_get_by_name(net, buf))
1066                 return i;
1067
1068         /* It is possible to run out of possible slots
1069          * when the name is long and there isn't enough space left
1070          * for the digits, or if all bits are used.
1071          */
1072         return -ENFILE;
1073 }
1074
1075 /**
1076  *      dev_alloc_name - allocate a name for a device
1077  *      @dev: device
1078  *      @name: name format string
1079  *
1080  *      Passed a format string - eg "lt%d" it will try and find a suitable
1081  *      id. It scans list of devices to build up a free map, then chooses
1082  *      the first empty slot. The caller must hold the dev_base or rtnl lock
1083  *      while allocating the name and adding the device in order to avoid
1084  *      duplicates.
1085  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1086  *      Returns the number of the unit assigned or a negative errno code.
1087  */
1088
1089 int dev_alloc_name(struct net_device *dev, const char *name)
1090 {
1091         char buf[IFNAMSIZ];
1092         struct net *net;
1093         int ret;
1094
1095         BUG_ON(!dev_net(dev));
1096         net = dev_net(dev);
1097         ret = __dev_alloc_name(net, name, buf);
1098         if (ret >= 0)
1099                 strlcpy(dev->name, buf, IFNAMSIZ);
1100         return ret;
1101 }
1102 EXPORT_SYMBOL(dev_alloc_name);
1103
1104 static int dev_alloc_name_ns(struct net *net,
1105                              struct net_device *dev,
1106                              const char *name)
1107 {
1108         char buf[IFNAMSIZ];
1109         int ret;
1110
1111         ret = __dev_alloc_name(net, name, buf);
1112         if (ret >= 0)
1113                 strlcpy(dev->name, buf, IFNAMSIZ);
1114         return ret;
1115 }
1116
1117 static int dev_get_valid_name(struct net *net,
1118                               struct net_device *dev,
1119                               const char *name)
1120 {
1121         BUG_ON(!net);
1122
1123         if (!dev_valid_name(name))
1124                 return -EINVAL;
1125
1126         if (strchr(name, '%'))
1127                 return dev_alloc_name_ns(net, dev, name);
1128         else if (__dev_get_by_name(net, name))
1129                 return -EEXIST;
1130         else if (dev->name != name)
1131                 strlcpy(dev->name, name, IFNAMSIZ);
1132
1133         return 0;
1134 }
1135
1136 /**
1137  *      dev_change_name - change name of a device
1138  *      @dev: device
1139  *      @newname: name (or format string) must be at least IFNAMSIZ
1140  *
1141  *      Change name of a device, can pass format strings "eth%d".
1142  *      for wildcarding.
1143  */
1144 int dev_change_name(struct net_device *dev, const char *newname)
1145 {
1146         unsigned char old_assign_type;
1147         char oldname[IFNAMSIZ];
1148         int err = 0;
1149         int ret;
1150         struct net *net;
1151
1152         ASSERT_RTNL();
1153         BUG_ON(!dev_net(dev));
1154
1155         net = dev_net(dev);
1156         if (dev->flags & IFF_UP)
1157                 return -EBUSY;
1158
1159         write_seqcount_begin(&devnet_rename_seq);
1160
1161         if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1162                 write_seqcount_end(&devnet_rename_seq);
1163                 return 0;
1164         }
1165
1166         memcpy(oldname, dev->name, IFNAMSIZ);
1167
1168         err = dev_get_valid_name(net, dev, newname);
1169         if (err < 0) {
1170                 write_seqcount_end(&devnet_rename_seq);
1171                 return err;
1172         }
1173
1174         if (oldname[0] && !strchr(oldname, '%'))
1175                 netdev_info(dev, "renamed from %s\n", oldname);
1176
1177         old_assign_type = dev->name_assign_type;
1178         dev->name_assign_type = NET_NAME_RENAMED;
1179
1180 rollback:
1181         ret = device_rename(&dev->dev, dev->name);
1182         if (ret) {
1183                 memcpy(dev->name, oldname, IFNAMSIZ);
1184                 dev->name_assign_type = old_assign_type;
1185                 write_seqcount_end(&devnet_rename_seq);
1186                 return ret;
1187         }
1188
1189         write_seqcount_end(&devnet_rename_seq);
1190
1191         netdev_adjacent_rename_links(dev, oldname);
1192
1193         write_lock_bh(&dev_base_lock);
1194         hlist_del_rcu(&dev->name_hlist);
1195         write_unlock_bh(&dev_base_lock);
1196
1197         synchronize_rcu();
1198
1199         write_lock_bh(&dev_base_lock);
1200         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1201         write_unlock_bh(&dev_base_lock);
1202
1203         ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1204         ret = notifier_to_errno(ret);
1205
1206         if (ret) {
1207                 /* err >= 0 after dev_alloc_name() or stores the first errno */
1208                 if (err >= 0) {
1209                         err = ret;
1210                         write_seqcount_begin(&devnet_rename_seq);
1211                         memcpy(dev->name, oldname, IFNAMSIZ);
1212                         memcpy(oldname, newname, IFNAMSIZ);
1213                         dev->name_assign_type = old_assign_type;
1214                         old_assign_type = NET_NAME_RENAMED;
1215                         goto rollback;
1216                 } else {
1217                         pr_err("%s: name change rollback failed: %d\n",
1218                                dev->name, ret);
1219                 }
1220         }
1221
1222         return err;
1223 }
1224
1225 /**
1226  *      dev_set_alias - change ifalias of a device
1227  *      @dev: device
1228  *      @alias: name up to IFALIASZ
1229  *      @len: limit of bytes to copy from info
1230  *
1231  *      Set ifalias for a device,
1232  */
1233 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1234 {
1235         char *new_ifalias;
1236
1237         ASSERT_RTNL();
1238
1239         if (len >= IFALIASZ)
1240                 return -EINVAL;
1241
1242         if (!len) {
1243                 kfree(dev->ifalias);
1244                 dev->ifalias = NULL;
1245                 return 0;
1246         }
1247
1248         new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1249         if (!new_ifalias)
1250                 return -ENOMEM;
1251         dev->ifalias = new_ifalias;
1252
1253         strlcpy(dev->ifalias, alias, len+1);
1254         return len;
1255 }
1256
1257
1258 /**
1259  *      netdev_features_change - device changes features
1260  *      @dev: device to cause notification
1261  *
1262  *      Called to indicate a device has changed features.
1263  */
1264 void netdev_features_change(struct net_device *dev)
1265 {
1266         call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1267 }
1268 EXPORT_SYMBOL(netdev_features_change);
1269
1270 /**
1271  *      netdev_state_change - device changes state
1272  *      @dev: device to cause notification
1273  *
1274  *      Called to indicate a device has changed state. This function calls
1275  *      the notifier chains for netdev_chain and sends a NEWLINK message
1276  *      to the routing socket.
1277  */
1278 void netdev_state_change(struct net_device *dev)
1279 {
1280         if (dev->flags & IFF_UP) {
1281                 struct netdev_notifier_change_info change_info;
1282
1283                 change_info.flags_changed = 0;
1284                 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
1285                                               &change_info.info);
1286                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1287         }
1288 }
1289 EXPORT_SYMBOL(netdev_state_change);
1290
1291 /**
1292  *      netdev_notify_peers - notify network peers about existence of @dev
1293  *      @dev: network device
1294  *
1295  * Generate traffic such that interested network peers are aware of
1296  * @dev, such as by generating a gratuitous ARP. This may be used when
1297  * a device wants to inform the rest of the network about some sort of
1298  * reconfiguration such as a failover event or virtual machine
1299  * migration.
1300  */
1301 void netdev_notify_peers(struct net_device *dev)
1302 {
1303         rtnl_lock();
1304         call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1305         rtnl_unlock();
1306 }
1307 EXPORT_SYMBOL(netdev_notify_peers);
1308
1309 static int __dev_open(struct net_device *dev)
1310 {
1311         const struct net_device_ops *ops = dev->netdev_ops;
1312         int ret;
1313
1314         ASSERT_RTNL();
1315
1316         if (!netif_device_present(dev))
1317                 return -ENODEV;
1318
1319         /* Block netpoll from trying to do any rx path servicing.
1320          * If we don't do this there is a chance ndo_poll_controller
1321          * or ndo_poll may be running while we open the device
1322          */
1323         netpoll_poll_disable(dev);
1324
1325         ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1326         ret = notifier_to_errno(ret);
1327         if (ret)
1328                 return ret;
1329
1330         set_bit(__LINK_STATE_START, &dev->state);
1331
1332         if (ops->ndo_validate_addr)
1333                 ret = ops->ndo_validate_addr(dev);
1334
1335         if (!ret && ops->ndo_open)
1336                 ret = ops->ndo_open(dev);
1337
1338         netpoll_poll_enable(dev);
1339
1340         if (ret)
1341                 clear_bit(__LINK_STATE_START, &dev->state);
1342         else {
1343                 dev->flags |= IFF_UP;
1344                 dev_set_rx_mode(dev);
1345                 dev_activate(dev);
1346                 add_device_randomness(dev->dev_addr, dev->addr_len);
1347         }
1348
1349         return ret;
1350 }
1351
1352 /**
1353  *      dev_open        - prepare an interface for use.
1354  *      @dev:   device to open
1355  *
1356  *      Takes a device from down to up state. The device's private open
1357  *      function is invoked and then the multicast lists are loaded. Finally
1358  *      the device is moved into the up state and a %NETDEV_UP message is
1359  *      sent to the netdev notifier chain.
1360  *
1361  *      Calling this function on an active interface is a nop. On a failure
1362  *      a negative errno code is returned.
1363  */
1364 int dev_open(struct net_device *dev)
1365 {
1366         int ret;
1367
1368         if (dev->flags & IFF_UP)
1369                 return 0;
1370
1371         ret = __dev_open(dev);
1372         if (ret < 0)
1373                 return ret;
1374
1375         rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1376         call_netdevice_notifiers(NETDEV_UP, dev);
1377
1378         return ret;
1379 }
1380 EXPORT_SYMBOL(dev_open);
1381
1382 static int __dev_close_many(struct list_head *head)
1383 {
1384         struct net_device *dev;
1385
1386         ASSERT_RTNL();
1387         might_sleep();
1388
1389         list_for_each_entry(dev, head, close_list) {
1390                 /* Temporarily disable netpoll until the interface is down */
1391                 netpoll_poll_disable(dev);
1392
1393                 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1394
1395                 clear_bit(__LINK_STATE_START, &dev->state);
1396
1397                 /* Synchronize to scheduled poll. We cannot touch poll list, it
1398                  * can be even on different cpu. So just clear netif_running().
1399                  *
1400                  * dev->stop() will invoke napi_disable() on all of it's
1401                  * napi_struct instances on this device.
1402                  */
1403                 smp_mb__after_atomic(); /* Commit netif_running(). */
1404         }
1405
1406         dev_deactivate_many(head);
1407
1408         list_for_each_entry(dev, head, close_list) {
1409                 const struct net_device_ops *ops = dev->netdev_ops;
1410
1411                 /*
1412                  *      Call the device specific close. This cannot fail.
1413                  *      Only if device is UP
1414                  *
1415                  *      We allow it to be called even after a DETACH hot-plug
1416                  *      event.
1417                  */
1418                 if (ops->ndo_stop)
1419                         ops->ndo_stop(dev);
1420
1421                 dev->flags &= ~IFF_UP;
1422                 netpoll_poll_enable(dev);
1423         }
1424
1425         return 0;
1426 }
1427
1428 static int __dev_close(struct net_device *dev)
1429 {
1430         int retval;
1431         LIST_HEAD(single);
1432
1433         list_add(&dev->close_list, &single);
1434         retval = __dev_close_many(&single);
1435         list_del(&single);
1436
1437         return retval;
1438 }
1439
1440 int dev_close_many(struct list_head *head, bool unlink)
1441 {
1442         struct net_device *dev, *tmp;
1443
1444         /* Remove the devices that don't need to be closed */
1445         list_for_each_entry_safe(dev, tmp, head, close_list)
1446                 if (!(dev->flags & IFF_UP))
1447                         list_del_init(&dev->close_list);
1448
1449         __dev_close_many(head);
1450
1451         list_for_each_entry_safe(dev, tmp, head, close_list) {
1452                 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1453                 call_netdevice_notifiers(NETDEV_DOWN, dev);
1454                 if (unlink)
1455                         list_del_init(&dev->close_list);
1456         }
1457
1458         return 0;
1459 }
1460 EXPORT_SYMBOL(dev_close_many);
1461
1462 /**
1463  *      dev_close - shutdown an interface.
1464  *      @dev: device to shutdown
1465  *
1466  *      This function moves an active device into down state. A
1467  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1468  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1469  *      chain.
1470  */
1471 int dev_close(struct net_device *dev)
1472 {
1473         if (dev->flags & IFF_UP) {
1474                 LIST_HEAD(single);
1475
1476                 list_add(&dev->close_list, &single);
1477                 dev_close_many(&single, true);
1478                 list_del(&single);
1479         }
1480         return 0;
1481 }
1482 EXPORT_SYMBOL(dev_close);
1483
1484
1485 /**
1486  *      dev_disable_lro - disable Large Receive Offload on a device
1487  *      @dev: device
1488  *
1489  *      Disable Large Receive Offload (LRO) on a net device.  Must be
1490  *      called under RTNL.  This is needed if received packets may be
1491  *      forwarded to another interface.
1492  */
1493 void dev_disable_lro(struct net_device *dev)
1494 {
1495         struct net_device *lower_dev;
1496         struct list_head *iter;
1497
1498         dev->wanted_features &= ~NETIF_F_LRO;
1499         netdev_update_features(dev);
1500
1501         if (unlikely(dev->features & NETIF_F_LRO))
1502                 netdev_WARN(dev, "failed to disable LRO!\n");
1503
1504         netdev_for_each_lower_dev(dev, lower_dev, iter)
1505                 dev_disable_lro(lower_dev);
1506 }
1507 EXPORT_SYMBOL(dev_disable_lro);
1508
1509 static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1510                                    struct net_device *dev)
1511 {
1512         struct netdev_notifier_info info;
1513
1514         netdev_notifier_info_init(&info, dev);
1515         return nb->notifier_call(nb, val, &info);
1516 }
1517
1518 static int dev_boot_phase = 1;
1519
1520 /**
1521  *      register_netdevice_notifier - register a network notifier block
1522  *      @nb: notifier
1523  *
1524  *      Register a notifier to be called when network device events occur.
1525  *      The notifier passed is linked into the kernel structures and must
1526  *      not be reused until it has been unregistered. A negative errno code
1527  *      is returned on a failure.
1528  *
1529  *      When registered all registration and up events are replayed
1530  *      to the new notifier to allow device to have a race free
1531  *      view of the network device list.
1532  */
1533
1534 int register_netdevice_notifier(struct notifier_block *nb)
1535 {
1536         struct net_device *dev;
1537         struct net_device *last;
1538         struct net *net;
1539         int err;
1540
1541         rtnl_lock();
1542         err = raw_notifier_chain_register(&netdev_chain, nb);
1543         if (err)
1544                 goto unlock;
1545         if (dev_boot_phase)
1546                 goto unlock;
1547         for_each_net(net) {
1548                 for_each_netdev(net, dev) {
1549                         err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1550                         err = notifier_to_errno(err);
1551                         if (err)
1552                                 goto rollback;
1553
1554                         if (!(dev->flags & IFF_UP))
1555                                 continue;
1556
1557                         call_netdevice_notifier(nb, NETDEV_UP, dev);
1558                 }
1559         }
1560
1561 unlock:
1562         rtnl_unlock();
1563         return err;
1564
1565 rollback:
1566         last = dev;
1567         for_each_net(net) {
1568                 for_each_netdev(net, dev) {
1569                         if (dev == last)
1570                                 goto outroll;
1571
1572                         if (dev->flags & IFF_UP) {
1573                                 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1574                                                         dev);
1575                                 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1576                         }
1577                         call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1578                 }
1579         }
1580
1581 outroll:
1582         raw_notifier_chain_unregister(&netdev_chain, nb);
1583         goto unlock;
1584 }
1585 EXPORT_SYMBOL(register_netdevice_notifier);
1586
1587 /**
1588  *      unregister_netdevice_notifier - unregister a network notifier block
1589  *      @nb: notifier
1590  *
1591  *      Unregister a notifier previously registered by
1592  *      register_netdevice_notifier(). The notifier is unlinked into the
1593  *      kernel structures and may then be reused. A negative errno code
1594  *      is returned on a failure.
1595  *
1596  *      After unregistering unregister and down device events are synthesized
1597  *      for all devices on the device list to the removed notifier to remove
1598  *      the need for special case cleanup code.
1599  */
1600
1601 int unregister_netdevice_notifier(struct notifier_block *nb)
1602 {
1603         struct net_device *dev;
1604         struct net *net;
1605         int err;
1606
1607         rtnl_lock();
1608         err = raw_notifier_chain_unregister(&netdev_chain, nb);
1609         if (err)
1610                 goto unlock;
1611
1612         for_each_net(net) {
1613                 for_each_netdev(net, dev) {
1614                         if (dev->flags & IFF_UP) {
1615                                 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1616                                                         dev);
1617                                 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1618                         }
1619                         call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1620                 }
1621         }
1622 unlock:
1623         rtnl_unlock();
1624         return err;
1625 }
1626 EXPORT_SYMBOL(unregister_netdevice_notifier);
1627
1628 /**
1629  *      call_netdevice_notifiers_info - call all network notifier blocks
1630  *      @val: value passed unmodified to notifier function
1631  *      @dev: net_device pointer passed unmodified to notifier function
1632  *      @info: notifier information data
1633  *
1634  *      Call all network notifier blocks.  Parameters and return value
1635  *      are as for raw_notifier_call_chain().
1636  */
1637
1638 static int call_netdevice_notifiers_info(unsigned long val,
1639                                          struct net_device *dev,
1640                                          struct netdev_notifier_info *info)
1641 {
1642         ASSERT_RTNL();
1643         netdev_notifier_info_init(info, dev);
1644         return raw_notifier_call_chain(&netdev_chain, val, info);
1645 }
1646
1647 /**
1648  *      call_netdevice_notifiers - call all network notifier blocks
1649  *      @val: value passed unmodified to notifier function
1650  *      @dev: net_device pointer passed unmodified to notifier function
1651  *
1652  *      Call all network notifier blocks.  Parameters and return value
1653  *      are as for raw_notifier_call_chain().
1654  */
1655
1656 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1657 {
1658         struct netdev_notifier_info info;
1659
1660         return call_netdevice_notifiers_info(val, dev, &info);
1661 }
1662 EXPORT_SYMBOL(call_netdevice_notifiers);
1663
1664 #ifdef CONFIG_NET_INGRESS
1665 static struct static_key ingress_needed __read_mostly;
1666
1667 void net_inc_ingress_queue(void)
1668 {
1669         static_key_slow_inc(&ingress_needed);
1670 }
1671 EXPORT_SYMBOL_GPL(net_inc_ingress_queue);
1672
1673 void net_dec_ingress_queue(void)
1674 {
1675         static_key_slow_dec(&ingress_needed);
1676 }
1677 EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
1678 #endif
1679
1680 #ifdef CONFIG_NET_EGRESS
1681 static struct static_key egress_needed __read_mostly;
1682
1683 void net_inc_egress_queue(void)
1684 {
1685         static_key_slow_inc(&egress_needed);
1686 }
1687 EXPORT_SYMBOL_GPL(net_inc_egress_queue);
1688
1689 void net_dec_egress_queue(void)
1690 {
1691         static_key_slow_dec(&egress_needed);
1692 }
1693 EXPORT_SYMBOL_GPL(net_dec_egress_queue);
1694 #endif
1695
1696 static struct static_key netstamp_needed __read_mostly;
1697 #ifdef HAVE_JUMP_LABEL
1698 /* We are not allowed to call static_key_slow_dec() from irq context
1699  * If net_disable_timestamp() is called from irq context, defer the
1700  * static_key_slow_dec() calls.
1701  */
1702 static atomic_t netstamp_needed_deferred;
1703 #endif
1704
1705 void net_enable_timestamp(void)
1706 {
1707 #ifdef HAVE_JUMP_LABEL
1708         int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1709
1710         if (deferred) {
1711                 while (--deferred)
1712                         static_key_slow_dec(&netstamp_needed);
1713                 return;
1714         }
1715 #endif
1716         static_key_slow_inc(&netstamp_needed);
1717 }
1718 EXPORT_SYMBOL(net_enable_timestamp);
1719
1720 void net_disable_timestamp(void)
1721 {
1722 #ifdef HAVE_JUMP_LABEL
1723         if (in_interrupt()) {
1724                 atomic_inc(&netstamp_needed_deferred);
1725                 return;
1726         }
1727 #endif
1728         static_key_slow_dec(&netstamp_needed);
1729 }
1730 EXPORT_SYMBOL(net_disable_timestamp);
1731
1732 static inline void net_timestamp_set(struct sk_buff *skb)
1733 {
1734         skb->tstamp = 0;
1735         if (static_key_false(&netstamp_needed))
1736                 __net_timestamp(skb);
1737 }
1738
1739 #define net_timestamp_check(COND, SKB)                  \
1740         if (static_key_false(&netstamp_needed)) {               \
1741                 if ((COND) && !(SKB)->tstamp)   \
1742                         __net_timestamp(SKB);           \
1743         }                                               \
1744
1745 bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
1746 {
1747         unsigned int len;
1748
1749         if (!(dev->flags & IFF_UP))
1750                 return false;
1751
1752         len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1753         if (skb->len <= len)
1754                 return true;
1755
1756         /* if TSO is enabled, we don't care about the length as the packet
1757          * could be forwarded without being segmented before
1758          */
1759         if (skb_is_gso(skb))
1760                 return true;
1761
1762         return false;
1763 }
1764 EXPORT_SYMBOL_GPL(is_skb_forwardable);
1765
1766 int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1767 {
1768         int ret = ____dev_forward_skb(dev, skb);
1769
1770         if (likely(!ret)) {
1771                 skb->protocol = eth_type_trans(skb, dev);
1772                 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
1773         }
1774
1775         return ret;
1776 }
1777 EXPORT_SYMBOL_GPL(__dev_forward_skb);
1778
1779 /**
1780  * dev_forward_skb - loopback an skb to another netif
1781  *
1782  * @dev: destination network device
1783  * @skb: buffer to forward
1784  *
1785  * return values:
1786  *      NET_RX_SUCCESS  (no congestion)
1787  *      NET_RX_DROP     (packet was dropped, but freed)
1788  *
1789  * dev_forward_skb can be used for injecting an skb from the
1790  * start_xmit function of one device into the receive queue
1791  * of another device.
1792  *
1793  * The receiving device may be in another namespace, so
1794  * we have to clear all information in the skb that could
1795  * impact namespace isolation.
1796  */
1797 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1798 {
1799         return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
1800 }
1801 EXPORT_SYMBOL_GPL(dev_forward_skb);
1802
1803 static inline int deliver_skb(struct sk_buff *skb,
1804                               struct packet_type *pt_prev,
1805                               struct net_device *orig_dev)
1806 {
1807         if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1808                 return -ENOMEM;
1809         atomic_inc(&skb->users);
1810         return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1811 }
1812
1813 static inline void deliver_ptype_list_skb(struct sk_buff *skb,
1814                                           struct packet_type **pt,
1815                                           struct net_device *orig_dev,
1816                                           __be16 type,
1817                                           struct list_head *ptype_list)
1818 {
1819         struct packet_type *ptype, *pt_prev = *pt;
1820
1821         list_for_each_entry_rcu(ptype, ptype_list, list) {
1822                 if (ptype->type != type)
1823                         continue;
1824                 if (pt_prev)
1825                         deliver_skb(skb, pt_prev, orig_dev);
1826                 pt_prev = ptype;
1827         }
1828         *pt = pt_prev;
1829 }
1830
1831 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1832 {
1833         if (!ptype->af_packet_priv || !skb->sk)
1834                 return false;
1835
1836         if (ptype->id_match)
1837                 return ptype->id_match(ptype, skb->sk);
1838         else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1839                 return true;
1840
1841         return false;
1842 }
1843
1844 /*
1845  *      Support routine. Sends outgoing frames to any network
1846  *      taps currently in use.
1847  */
1848
1849 void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1850 {
1851         struct packet_type *ptype;
1852         struct sk_buff *skb2 = NULL;
1853         struct packet_type *pt_prev = NULL;
1854         struct list_head *ptype_list = &ptype_all;
1855
1856         rcu_read_lock();
1857 again:
1858         list_for_each_entry_rcu(ptype, ptype_list, list) {
1859                 /* Never send packets back to the socket
1860                  * they originated from - MvS (miquels@drinkel.ow.org)
1861                  */
1862                 if (skb_loop_sk(ptype, skb))
1863                         continue;
1864
1865                 if (pt_prev) {
1866                         deliver_skb(skb2, pt_prev, skb->dev);
1867                         pt_prev = ptype;
1868                         continue;
1869                 }
1870
1871                 /* need to clone skb, done only once */
1872                 skb2 = skb_clone(skb, GFP_ATOMIC);
1873                 if (!skb2)
1874                         goto out_unlock;
1875
1876                 net_timestamp_set(skb2);
1877
1878                 /* skb->nh should be correctly
1879                  * set by sender, so that the second statement is
1880                  * just protection against buggy protocols.
1881                  */
1882                 skb_reset_mac_header(skb2);
1883
1884                 if (skb_network_header(skb2) < skb2->data ||
1885                     skb_network_header(skb2) > skb_tail_pointer(skb2)) {
1886                         net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1887                                              ntohs(skb2->protocol),
1888                                              dev->name);
1889                         skb_reset_network_header(skb2);
1890                 }
1891
1892                 skb2->transport_header = skb2->network_header;
1893                 skb2->pkt_type = PACKET_OUTGOING;
1894                 pt_prev = ptype;
1895         }
1896
1897         if (ptype_list == &ptype_all) {
1898                 ptype_list = &dev->ptype_all;
1899                 goto again;
1900         }
1901 out_unlock:
1902         if (pt_prev)
1903                 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1904         rcu_read_unlock();
1905 }
1906 EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);
1907
1908 /**
1909  * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1910  * @dev: Network device
1911  * @txq: number of queues available
1912  *
1913  * If real_num_tx_queues is changed the tc mappings may no longer be
1914  * valid. To resolve this verify the tc mapping remains valid and if
1915  * not NULL the mapping. With no priorities mapping to this
1916  * offset/count pair it will no longer be used. In the worst case TC0
1917  * is invalid nothing can be done so disable priority mappings. If is
1918  * expected that drivers will fix this mapping if they can before
1919  * calling netif_set_real_num_tx_queues.
1920  */
1921 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1922 {
1923         int i;
1924         struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1925
1926         /* If TC0 is invalidated disable TC mapping */
1927         if (tc->offset + tc->count > txq) {
1928                 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1929                 dev->num_tc = 0;
1930                 return;
1931         }
1932
1933         /* Invalidated prio to tc mappings set to TC0 */
1934         for (i = 1; i < TC_BITMASK + 1; i++) {
1935                 int q = netdev_get_prio_tc_map(dev, i);
1936
1937                 tc = &dev->tc_to_txq[q];
1938                 if (tc->offset + tc->count > txq) {
1939                         pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1940                                 i, q);
1941                         netdev_set_prio_tc_map(dev, i, 0);
1942                 }
1943         }
1944 }
1945
1946 int netdev_txq_to_tc(struct net_device *dev, unsigned int txq)
1947 {
1948         if (dev->num_tc) {
1949                 struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1950                 int i;
1951
1952                 for (i = 0; i < TC_MAX_QUEUE; i++, tc++) {
1953                         if ((txq - tc->offset) < tc->count)
1954                                 return i;
1955                 }
1956
1957                 return -1;
1958         }
1959
1960         return 0;
1961 }
1962
1963 #ifdef CONFIG_XPS
1964 static DEFINE_MUTEX(xps_map_mutex);
1965 #define xmap_dereference(P)             \
1966         rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1967
1968 static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
1969                              int tci, u16 index)
1970 {
1971         struct xps_map *map = NULL;
1972         int pos;
1973
1974         if (dev_maps)
1975                 map = xmap_dereference(dev_maps->cpu_map[tci]);
1976         if (!map)
1977                 return false;
1978
1979         for (pos = map->len; pos--;) {
1980                 if (map->queues[pos] != index)
1981                         continue;
1982
1983                 if (map->len > 1) {
1984                         map->queues[pos] = map->queues[--map->len];
1985                         break;
1986                 }
1987
1988                 RCU_INIT_POINTER(dev_maps->cpu_map[tci], NULL);
1989                 kfree_rcu(map, rcu);
1990                 return false;
1991         }
1992
1993         return true;
1994 }
1995
1996 static bool remove_xps_queue_cpu(struct net_device *dev,
1997                                  struct xps_dev_maps *dev_maps,
1998                                  int cpu, u16 offset, u16 count)
1999 {
2000         int num_tc = dev->num_tc ? : 1;
2001         bool active = false;
2002         int tci;
2003
2004         for (tci = cpu * num_tc; num_tc--; tci++) {
2005                 int i, j;
2006
2007                 for (i = count, j = offset; i--; j++) {
2008                         if (!remove_xps_queue(dev_maps, cpu, j))
2009                                 break;
2010                 }
2011
2012                 active |= i < 0;
2013         }
2014
2015         return active;
2016 }
2017
2018 static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
2019                                    u16 count)
2020 {
2021         struct xps_dev_maps *dev_maps;
2022         int cpu, i;
2023         bool active = false;
2024
2025         mutex_lock(&xps_map_mutex);
2026         dev_maps = xmap_dereference(dev->xps_maps);
2027
2028         if (!dev_maps)
2029                 goto out_no_maps;
2030
2031         for_each_possible_cpu(cpu)
2032                 active |= remove_xps_queue_cpu(dev, dev_maps, cpu,
2033                                                offset, count);
2034
2035         if (!active) {
2036                 RCU_INIT_POINTER(dev->xps_maps, NULL);
2037                 kfree_rcu(dev_maps, rcu);
2038         }
2039
2040         for (i = offset + (count - 1); count--; i--)
2041                 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
2042                                              NUMA_NO_NODE);
2043
2044 out_no_maps:
2045         mutex_unlock(&xps_map_mutex);
2046 }
2047
2048 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
2049 {
2050         netif_reset_xps_queues(dev, index, dev->num_tx_queues - index);
2051 }
2052
2053 static struct xps_map *expand_xps_map(struct xps_map *map,
2054                                       int cpu, u16 index)
2055 {
2056         struct xps_map *new_map;
2057         int alloc_len = XPS_MIN_MAP_ALLOC;
2058         int i, pos;
2059
2060         for (pos = 0; map && pos < map->len; pos++) {
2061                 if (map->queues[pos] != index)
2062                         continue;
2063                 return map;
2064         }
2065
2066         /* Need to add queue to this CPU's existing map */
2067         if (map) {
2068                 if (pos < map->alloc_len)
2069                         return map;
2070
2071                 alloc_len = map->alloc_len * 2;
2072         }
2073
2074         /* Need to allocate new map to store queue on this CPU's map */
2075         new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
2076                                cpu_to_node(cpu));
2077         if (!new_map)
2078                 return NULL;
2079
2080         for (i = 0; i < pos; i++)
2081                 new_map->queues[i] = map->queues[i];
2082         new_map->alloc_len = alloc_len;
2083         new_map->len = pos;
2084
2085         return new_map;
2086 }
2087
2088 int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
2089                         u16 index)
2090 {
2091         struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
2092         int i, cpu, tci, numa_node_id = -2;
2093         int maps_sz, num_tc = 1, tc = 0;
2094         struct xps_map *map, *new_map;
2095         bool active = false;
2096
2097         if (dev->num_tc) {
2098                 num_tc = dev->num_tc;
2099                 tc = netdev_txq_to_tc(dev, index);
2100                 if (tc < 0)
2101                         return -EINVAL;
2102         }
2103
2104         maps_sz = XPS_DEV_MAPS_SIZE(num_tc);
2105         if (maps_sz < L1_CACHE_BYTES)
2106                 maps_sz = L1_CACHE_BYTES;
2107
2108         mutex_lock(&xps_map_mutex);
2109
2110         dev_maps = xmap_dereference(dev->xps_maps);
2111
2112         /* allocate memory for queue storage */
2113         for_each_cpu_and(cpu, cpu_online_mask, mask) {
2114                 if (!new_dev_maps)
2115                         new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
2116                 if (!new_dev_maps) {
2117                         mutex_unlock(&xps_map_mutex);
2118                         return -ENOMEM;
2119                 }
2120
2121                 tci = cpu * num_tc + tc;
2122                 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[tci]) :
2123                                  NULL;
2124
2125                 map = expand_xps_map(map, cpu, index);
2126                 if (!map)
2127                         goto error;
2128
2129                 RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
2130         }
2131
2132         if (!new_dev_maps)
2133                 goto out_no_new_maps;
2134
2135         for_each_possible_cpu(cpu) {
2136                 /* copy maps belonging to foreign traffic classes */
2137                 for (i = tc, tci = cpu * num_tc; dev_maps && i--; tci++) {
2138                         /* fill in the new device map from the old device map */
2139                         map = xmap_dereference(dev_maps->cpu_map[tci]);
2140                         RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
2141                 }
2142
2143                 /* We need to explicitly update tci as prevous loop
2144                  * could break out early if dev_maps is NULL.
2145                  */
2146                 tci = cpu * num_tc + tc;
2147
2148                 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
2149                         /* add queue to CPU maps */
2150                         int pos = 0;
2151
2152                         map = xmap_dereference(new_dev_maps->cpu_map[tci]);
2153                         while ((pos < map->len) && (map->queues[pos] != index))
2154                                 pos++;
2155
2156                         if (pos == map->len)
2157                                 map->queues[map->len++] = index;
2158 #ifdef CONFIG_NUMA
2159                         if (numa_node_id == -2)
2160                                 numa_node_id = cpu_to_node(cpu);
2161                         else if (numa_node_id != cpu_to_node(cpu))
2162                                 numa_node_id = -1;
2163 #endif
2164                 } else if (dev_maps) {
2165                         /* fill in the new device map from the old device map */
2166                         map = xmap_dereference(dev_maps->cpu_map[tci]);
2167                         RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
2168                 }
2169
2170                 /* copy maps belonging to foreign traffic classes */
2171                 for (i = num_tc - tc, tci++; dev_maps && --i; tci++) {
2172                         /* fill in the new device map from the old device map */
2173                         map = xmap_dereference(dev_maps->cpu_map[tci]);
2174                         RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
2175                 }
2176         }
2177
2178         rcu_assign_pointer(dev->xps_maps, new_dev_maps);
2179
2180         /* Cleanup old maps */
2181         if (!dev_maps)
2182                 goto out_no_old_maps;
2183
2184         for_each_possible_cpu(cpu) {
2185                 for (i = num_tc, tci = cpu * num_tc; i--; tci++) {
2186                         new_map = xmap_dereference(new_dev_maps->cpu_map[tci]);
2187                         map = xmap_dereference(dev_maps->cpu_map[tci]);
2188                         if (map && map != new_map)
2189                                 kfree_rcu(map, rcu);
2190                 }
2191         }
2192
2193         kfree_rcu(dev_maps, rcu);
2194
2195 out_no_old_maps:
2196         dev_maps = new_dev_maps;
2197         active = true;
2198
2199 out_no_new_maps:
2200         /* update Tx queue numa node */
2201         netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2202                                      (numa_node_id >= 0) ? numa_node_id :
2203                                      NUMA_NO_NODE);
2204
2205         if (!dev_maps)
2206                 goto out_no_maps;
2207
2208         /* removes queue from unused CPUs */
2209         for_each_possible_cpu(cpu) {
2210                 for (i = tc, tci = cpu * num_tc; i--; tci++)
2211                         active |= remove_xps_queue(dev_maps, tci, index);
2212                 if (!cpumask_test_cpu(cpu, mask) || !cpu_online(cpu))
2213                         active |= remove_xps_queue(dev_maps, tci, index);
2214                 for (i = num_tc - tc, tci++; --i; tci++)
2215                         active |= remove_xps_queue(dev_maps, tci, index);
2216         }
2217
2218         /* free map if not active */
2219         if (!active) {
2220                 RCU_INIT_POINTER(dev->xps_maps, NULL);
2221                 kfree_rcu(dev_maps, rcu);
2222         }
2223
2224 out_no_maps:
2225         mutex_unlock(&xps_map_mutex);
2226
2227         return 0;
2228 error:
2229         /* remove any maps that we added */
2230         for_each_possible_cpu(cpu) {
2231                 for (i = num_tc, tci = cpu * num_tc; i--; tci++) {
2232                         new_map = xmap_dereference(new_dev_maps->cpu_map[tci]);
2233                         map = dev_maps ?
2234                               xmap_dereference(dev_maps->cpu_map[tci]) :
2235                               NULL;
2236                         if (new_map && new_map != map)
2237                                 kfree(new_map);
2238                 }
2239         }
2240
2241         mutex_unlock(&xps_map_mutex);
2242
2243         kfree(new_dev_maps);
2244         return -ENOMEM;
2245 }
2246 EXPORT_SYMBOL(netif_set_xps_queue);
2247
2248 #endif
2249 void netdev_reset_tc(struct net_device *dev)
2250 {
2251 #ifdef CONFIG_XPS
2252         netif_reset_xps_queues_gt(dev, 0);
2253 #endif
2254         dev->num_tc = 0;
2255         memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq));
2256         memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map));
2257 }
2258 EXPORT_SYMBOL(netdev_reset_tc);
2259
2260 int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset)
2261 {
2262         if (tc >= dev->num_tc)
2263                 return -EINVAL;
2264
2265 #ifdef CONFIG_XPS
2266         netif_reset_xps_queues(dev, offset, count);
2267 #endif
2268         dev->tc_to_txq[tc].count = count;
2269         dev->tc_to_txq[tc].offset = offset;
2270         return 0;
2271 }
2272 EXPORT_SYMBOL(netdev_set_tc_queue);
2273
2274 int netdev_set_num_tc(struct net_device *dev, u8 num_tc)
2275 {
2276         if (num_tc > TC_MAX_QUEUE)
2277                 return -EINVAL;
2278
2279 #ifdef CONFIG_XPS
2280         netif_reset_xps_queues_gt(dev, 0);
2281 #endif
2282         dev->num_tc = num_tc;
2283         return 0;
2284 }
2285 EXPORT_SYMBOL(netdev_set_num_tc);
2286
2287 /*
2288  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2289  * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
2290  */
2291 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2292 {
2293         int rc;
2294
2295         if (txq < 1 || txq > dev->num_tx_queues)
2296                 return -EINVAL;
2297
2298         if (dev->reg_state == NETREG_REGISTERED ||
2299             dev->reg_state == NETREG_UNREGISTERING) {
2300                 ASSERT_RTNL();
2301
2302                 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2303                                                   txq);
2304                 if (rc)
2305                         return rc;
2306
2307                 if (dev->num_tc)
2308                         netif_setup_tc(dev, txq);
2309
2310                 if (txq < dev->real_num_tx_queues) {
2311                         qdisc_reset_all_tx_gt(dev, txq);
2312 #ifdef CONFIG_XPS
2313                         netif_reset_xps_queues_gt(dev, txq);
2314 #endif
2315                 }
2316         }
2317
2318         dev->real_num_tx_queues = txq;
2319         return 0;
2320 }
2321 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2322
2323 #ifdef CONFIG_SYSFS
2324 /**
2325  *      netif_set_real_num_rx_queues - set actual number of RX queues used
2326  *      @dev: Network device
2327  *      @rxq: Actual number of RX queues
2328  *
2329  *      This must be called either with the rtnl_lock held or before
2330  *      registration of the net device.  Returns 0 on success, or a
2331  *      negative error code.  If called before registration, it always
2332  *      succeeds.
2333  */
2334 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2335 {
2336         int rc;
2337
2338         if (rxq < 1 || rxq > dev->num_rx_queues)
2339                 return -EINVAL;
2340
2341         if (dev->reg_state == NETREG_REGISTERED) {
2342                 ASSERT_RTNL();
2343
2344                 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2345                                                   rxq);
2346                 if (rc)
2347                         return rc;
2348         }
2349
2350         dev->real_num_rx_queues = rxq;
2351         return 0;
2352 }
2353 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2354 #endif
2355
2356 /**
2357  * netif_get_num_default_rss_queues - default number of RSS queues
2358  *
2359  * This routine should set an upper limit on the number of RSS queues
2360  * used by default by multiqueue devices.
2361  */
2362 int netif_get_num_default_rss_queues(void)
2363 {
2364         return is_kdump_kernel() ?
2365                 1 : min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2366 }
2367 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
2368
2369 static void __netif_reschedule(struct Qdisc *q)
2370 {
2371         struct softnet_data *sd;
2372         unsigned long flags;
2373
2374         local_irq_save(flags);
2375         sd = this_cpu_ptr(&softnet_data);
2376         q->next_sched = NULL;
2377         *sd->output_queue_tailp = q;
2378         sd->output_queue_tailp = &q->next_sched;
2379         raise_softirq_irqoff(NET_TX_SOFTIRQ);
2380         local_irq_restore(flags);
2381 }
2382
2383 void __netif_schedule(struct Qdisc *q)
2384 {
2385         if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2386                 __netif_reschedule(q);
2387 }
2388 EXPORT_SYMBOL(__netif_schedule);
2389
2390 struct dev_kfree_skb_cb {
2391         enum skb_free_reason reason;
2392 };
2393
2394 static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
2395 {
2396         return (struct dev_kfree_skb_cb *)skb->cb;
2397 }
2398
2399 void netif_schedule_queue(struct netdev_queue *txq)
2400 {
2401         rcu_read_lock();
2402         if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
2403                 struct Qdisc *q = rcu_dereference(txq->qdisc);
2404
2405                 __netif_schedule(q);
2406         }
2407         rcu_read_unlock();
2408 }
2409 EXPORT_SYMBOL(netif_schedule_queue);
2410
2411 void netif_tx_wake_queue(struct netdev_queue *dev_queue)
2412 {
2413         if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
2414                 struct Qdisc *q;
2415
2416                 rcu_read_lock();
2417                 q = rcu_dereference(dev_queue->qdisc);
2418                 __netif_schedule(q);
2419                 rcu_read_unlock();
2420         }
2421 }
2422 EXPORT_SYMBOL(netif_tx_wake_queue);
2423
2424 void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
2425 {
2426         unsigned long flags;
2427
2428         if (likely(atomic_read(&skb->users) == 1)) {
2429                 smp_rmb();
2430                 atomic_set(&skb->users, 0);
2431         } else if (likely(!atomic_dec_and_test(&skb->users))) {
2432                 return;
2433         }
2434         get_kfree_skb_cb(skb)->reason = reason;
2435         local_irq_save(flags);
2436         skb->next = __this_cpu_read(softnet_data.completion_queue);
2437         __this_cpu_write(softnet_data.completion_queue, skb);
2438         raise_softirq_irqoff(NET_TX_SOFTIRQ);
2439         local_irq_restore(flags);
2440 }
2441 EXPORT_SYMBOL(__dev_kfree_skb_irq);
2442
2443 void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2444 {
2445         if (in_irq() || irqs_disabled())
2446                 __dev_kfree_skb_irq(skb, reason);
2447         else
2448                 dev_kfree_skb(skb);
2449 }
2450 EXPORT_SYMBOL(__dev_kfree_skb_any);
2451
2452
2453 /**
2454  * netif_device_detach - mark device as removed
2455  * @dev: network device
2456  *
2457  * Mark device as removed from system and therefore no longer available.
2458  */
2459 void netif_device_detach(struct net_device *dev)
2460 {
2461         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2462             netif_running(dev)) {
2463                 netif_tx_stop_all_queues(dev);
2464         }
2465 }
2466 EXPORT_SYMBOL(netif_device_detach);
2467
2468 /**
2469  * netif_device_attach - mark device as attached
2470  * @dev: network device
2471  *
2472  * Mark device as attached from system and restart if needed.
2473  */
2474 void netif_device_attach(struct net_device *dev)
2475 {
2476         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2477             netif_running(dev)) {
2478                 netif_tx_wake_all_queues(dev);
2479                 __netdev_watchdog_up(dev);
2480         }
2481 }
2482 EXPORT_SYMBOL(netif_device_attach);
2483
2484 /*
2485  * Returns a Tx hash based on the given packet descriptor a Tx queues' number
2486  * to be used as a distribution range.
2487  */
2488 u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
2489                   unsigned int num_tx_queues)
2490 {
2491         u32 hash;
2492         u16 qoffset = 0;
2493         u16 qcount = num_tx_queues;
2494
2495         if (skb_rx_queue_recorded(skb)) {
2496                 hash = skb_get_rx_queue(skb);
2497                 while (unlikely(hash >= num_tx_queues))
2498                         hash -= num_tx_queues;
2499                 return hash;
2500         }
2501
2502         if (dev->num_tc) {
2503                 u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2504                 qoffset = dev->tc_to_txq[tc].offset;
2505                 qcount = dev->tc_to_txq[tc].count;
2506         }
2507
2508         return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
2509 }
2510 EXPORT_SYMBOL(__skb_tx_hash);
2511
2512 static void skb_warn_bad_offload(const struct sk_buff *skb)
2513 {
2514         static const netdev_features_t null_features;
2515         struct net_device *dev = skb->dev;
2516         const char *name = "";
2517
2518         if (!net_ratelimit())
2519                 return;
2520
2521         if (dev) {
2522                 if (dev->dev.parent)
2523                         name = dev_driver_string(dev->dev.parent);
2524                 else
2525                         name = netdev_name(dev);
2526         }
2527         WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2528              "gso_type=%d ip_summed=%d\n",
2529              name, dev ? &dev->features : &null_features,
2530              skb->sk ? &skb->sk->sk_route_caps : &null_features,
2531              skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2532              skb_shinfo(skb)->gso_type, skb->ip_summed);
2533 }
2534
2535 /*
2536  * Invalidate hardware checksum when packet is to be mangled, and
2537  * complete checksum manually on outgoing path.
2538  */
2539 int skb_checksum_help(struct sk_buff *skb)
2540 {
2541         __wsum csum;
2542         int ret = 0, offset;
2543
2544         if (skb->ip_summed == CHECKSUM_COMPLETE)
2545                 goto out_set_summed;
2546
2547         if (unlikely(skb_shinfo(skb)->gso_size)) {
2548                 skb_warn_bad_offload(skb);
2549                 return -EINVAL;
2550         }
2551
2552         /* Before computing a checksum, we should make sure no frag could
2553          * be modified by an external entity : checksum could be wrong.
2554          */
2555         if (skb_has_shared_frag(skb)) {
2556                 ret = __skb_linearize(skb);
2557                 if (ret)
2558                         goto out;
2559         }
2560
2561         offset = skb_checksum_start_offset(skb);
2562         BUG_ON(offset >= skb_headlen(skb));
2563         csum = skb_checksum(skb, offset, skb->len - offset, 0);
2564
2565         offset += skb->csum_offset;
2566         BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2567
2568         if (skb_cloned(skb) &&
2569             !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2570                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2571                 if (ret)
2572                         goto out;
2573         }
2574
2575         *(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0;
2576 out_set_summed:
2577         skb->ip_summed = CHECKSUM_NONE;
2578 out:
2579         return ret;
2580 }
2581 EXPORT_SYMBOL(skb_checksum_help);
2582
2583 __be16 skb_network_protocol(struct sk_buff *skb, int *depth)
2584 {
2585         __be16 type = skb->protocol;
2586
2587         /* Tunnel gso handlers can set protocol to ethernet. */
2588         if (type == htons(ETH_P_TEB)) {
2589                 struct ethhdr *eth;
2590
2591                 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2592                         return 0;
2593
2594                 eth = (struct ethhdr *)skb_mac_header(skb);
2595                 type = eth->h_proto;
2596         }
2597
2598         return __vlan_get_protocol(skb, type, depth);
2599 }
2600
2601 /**
2602  *      skb_mac_gso_segment - mac layer segmentation handler.
2603  *      @skb: buffer to segment
2604  *      @features: features for the output path (see dev->features)
2605  */
2606 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2607                                     netdev_features_t features)
2608 {
2609         struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2610         struct packet_offload *ptype;
2611         int vlan_depth = skb->mac_len;
2612         __be16 type = skb_network_protocol(skb, &vlan_depth);
2613
2614         if (unlikely(!type))
2615                 return ERR_PTR(-EINVAL);
2616
2617         __skb_pull(skb, vlan_depth);
2618
2619         rcu_read_lock();
2620         list_for_each_entry_rcu(ptype, &offload_base, list) {
2621                 if (ptype->type == type && ptype->callbacks.gso_segment) {
2622                         segs = ptype->callbacks.gso_segment(skb, features);
2623                         break;
2624                 }
2625         }
2626         rcu_read_unlock();
2627
2628         __skb_push(skb, skb->data - skb_mac_header(skb));
2629
2630         return segs;
2631 }
2632 EXPORT_SYMBOL(skb_mac_gso_segment);
2633
2634
2635 /* openvswitch calls this on rx path, so we need a different check.
2636  */
2637 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2638 {
2639         if (tx_path)
2640                 return skb->ip_summed != CHECKSUM_PARTIAL &&
2641                        skb->ip_summed != CHECKSUM_NONE;
2642
2643         return skb->ip_summed == CHECKSUM_NONE;
2644 }
2645
2646 /**
2647  *      __skb_gso_segment - Perform segmentation on skb.
2648  *      @skb: buffer to segment
2649  *      @features: features for the output path (see dev->features)
2650  *      @tx_path: whether it is called in TX path
2651  *
2652  *      This function segments the given skb and returns a list of segments.
2653  *
2654  *      It may return NULL if the skb requires no segmentation.  This is
2655  *      only possible when GSO is used for verifying header integrity.
2656  *
2657  *      Segmentation preserves SKB_SGO_CB_OFFSET bytes of previous skb cb.
2658  */
2659 struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2660                                   netdev_features_t features, bool tx_path)
2661 {
2662         struct sk_buff *segs;
2663
2664         if (unlikely(skb_needs_check(skb, tx_path))) {
2665                 int err;
2666
2667                 /* We're going to init ->check field in TCP or UDP header */
2668                 err = skb_cow_head(skb, 0);
2669                 if (err < 0)
2670                         return ERR_PTR(err);
2671         }
2672
2673         /* Only report GSO partial support if it will enable us to
2674          * support segmentation on this frame without needing additional
2675          * work.
2676          */
2677         if (features & NETIF_F_GSO_PARTIAL) {
2678                 netdev_features_t partial_features = NETIF_F_GSO_ROBUST;
2679                 struct net_device *dev = skb->dev;
2680
2681                 partial_features |= dev->features & dev->gso_partial_features;
2682                 if (!skb_gso_ok(skb, features | partial_features))
2683                         features &= ~NETIF_F_GSO_PARTIAL;
2684         }
2685
2686         BUILD_BUG_ON(SKB_SGO_CB_OFFSET +
2687                      sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
2688
2689         SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2690         SKB_GSO_CB(skb)->encap_level = 0;
2691
2692         skb_reset_mac_header(skb);
2693         skb_reset_mac_len(skb);
2694
2695         segs = skb_mac_gso_segment(skb, features);
2696
2697         if (unlikely(skb_needs_check(skb, tx_path)))
2698                 skb_warn_bad_offload(skb);
2699
2700         return segs;
2701 }
2702 EXPORT_SYMBOL(__skb_gso_segment);
2703
2704 /* Take action when hardware reception checksum errors are detected. */
2705 #ifdef CONFIG_BUG
2706 void netdev_rx_csum_fault(struct net_device *dev)
2707 {
2708         if (net_ratelimit()) {
2709                 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2710                 dump_stack();
2711         }
2712 }
2713 EXPORT_SYMBOL(netdev_rx_csum_fault);
2714 #endif
2715
2716 /* Actually, we should eliminate this check as soon as we know, that:
2717  * 1. IOMMU is present and allows to map all the memory.
2718  * 2. No high memory really exists on this machine.
2719  */
2720
2721 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
2722 {
2723 #ifdef CONFIG_HIGHMEM
2724         int i;
2725         if (!(dev->features & NETIF_F_HIGHDMA)) {
2726                 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2727                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2728                         if (PageHighMem(skb_frag_page(frag)))
2729                                 return 1;
2730                 }
2731         }
2732
2733         if (PCI_DMA_BUS_IS_PHYS) {
2734                 struct device *pdev = dev->dev.parent;
2735
2736                 if (!pdev)
2737                         return 0;
2738                 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2739                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2740                         dma_addr_t addr = page_to_phys(skb_frag_page(frag));
2741                         if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2742                                 return 1;
2743                 }
2744         }
2745 #endif
2746         return 0;
2747 }
2748
2749 /* If MPLS offload request, verify we are testing hardware MPLS features
2750  * instead of standard features for the netdev.
2751  */
2752 #if IS_ENABLED(CONFIG_NET_MPLS_GSO)
2753 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2754                                            netdev_features_t features,
2755                                            __be16 type)
2756 {
2757         if (eth_p_mpls(type))
2758                 features &= skb->dev->mpls_features;
2759
2760         return features;
2761 }
2762 #else
2763 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2764                                            netdev_features_t features,
2765                                            __be16 type)
2766 {
2767         return features;
2768 }
2769 #endif
2770
2771 static netdev_features_t harmonize_features(struct sk_buff *skb,
2772         netdev_features_t features)
2773 {
2774         int tmp;
2775         __be16 type;
2776
2777         type = skb_network_protocol(skb, &tmp);
2778         features = net_mpls_features(skb, features, type);
2779
2780         if (skb->ip_summed != CHECKSUM_NONE &&
2781             !can_checksum_protocol(features, type)) {
2782                 features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
2783         }
2784         if (illegal_highdma(skb->dev, skb))
2785                 features &= ~NETIF_F_SG;
2786
2787         return features;
2788 }
2789
2790 netdev_features_t passthru_features_check(struct sk_buff *skb,
2791                                           struct net_device *dev,
2792                                           netdev_features_t features)
2793 {
2794         return features;
2795 }
2796 EXPORT_SYMBOL(passthru_features_check);
2797
2798 static netdev_features_t dflt_features_check(const struct sk_buff *skb,
2799                                              struct net_device *dev,
2800                                              netdev_features_t features)
2801 {
2802         return vlan_features_check(skb, features);
2803 }
2804
2805 static netdev_features_t gso_features_check(const struct sk_buff *skb,
2806                                             struct net_device *dev,
2807                                             netdev_features_t features)
2808 {
2809         u16 gso_segs = skb_shinfo(skb)->gso_segs;
2810
2811         if (gso_segs > dev->gso_max_segs)
2812                 return features & ~NETIF_F_GSO_MASK;
2813
2814         /* Support for GSO partial features requires software
2815          * intervention before we can actually process the packets
2816          * so we need to strip support for any partial features now
2817          * and we can pull them back in after we have partially
2818          * segmented the frame.
2819          */
2820         if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
2821                 features &= ~dev->gso_partial_features;
2822
2823         /* Make sure to clear the IPv4 ID mangling feature if the
2824          * IPv4 header has the potential to be fragmented.
2825          */
2826         if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
2827                 struct iphdr *iph = skb->encapsulation ?
2828                                     inner_ip_hdr(skb) : ip_hdr(skb);
2829
2830                 if (!(iph->frag_off & htons(IP_DF)))
2831                         features &= ~NETIF_F_TSO_MANGLEID;
2832         }
2833
2834         return features;
2835 }
2836
2837 netdev_features_t netif_skb_features(struct sk_buff *skb)
2838 {
2839         struct net_device *dev = skb->dev;
2840         netdev_features_t features = dev->features;
2841
2842         if (skb_is_gso(skb))
2843                 features = gso_features_check(skb, dev, features);
2844
2845         /* If encapsulation offload request, verify we are testing
2846          * hardware encapsulation features instead of standard
2847          * features for the netdev
2848          */
2849         if (skb->encapsulation)
2850                 features &= dev->hw_enc_features;
2851
2852         if (skb_vlan_tagged(skb))
2853                 features = netdev_intersect_features(features,
2854                                                      dev->vlan_features |
2855                                                      NETIF_F_HW_VLAN_CTAG_TX |
2856                                                      NETIF_F_HW_VLAN_STAG_TX);
2857
2858         if (dev->netdev_ops->ndo_features_check)
2859                 features &= dev->netdev_ops->ndo_features_check(skb, dev,
2860                                                                 features);
2861         else
2862                 features &= dflt_features_check(skb, dev, features);
2863
2864         return harmonize_features(skb, features);
2865 }
2866 EXPORT_SYMBOL(netif_skb_features);
2867
2868 static int xmit_one(struct sk_buff *skb, struct net_device *dev,
2869                     struct netdev_queue *txq, bool more)
2870 {
2871         unsigned int len;
2872         int rc;
2873
2874         if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all))
2875                 dev_queue_xmit_nit(skb, dev);
2876
2877         len = skb->len;
2878         trace_net_dev_start_xmit(skb, dev);
2879         rc = netdev_start_xmit(skb, dev, txq, more);
2880         trace_net_dev_xmit(skb, rc, dev, len);
2881
2882         return rc;
2883 }
2884
2885 struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
2886                                     struct netdev_queue *txq, int *ret)
2887 {
2888         struct sk_buff *skb = first;
2889         int rc = NETDEV_TX_OK;
2890
2891         while (skb) {
2892                 struct sk_buff *next = skb->next;
2893
2894                 skb->next = NULL;
2895                 rc = xmit_one(skb, dev, txq, next != NULL);
2896                 if (unlikely(!dev_xmit_complete(rc))) {
2897                         skb->next = next;
2898                         goto out;
2899                 }
2900
2901                 skb = next;
2902                 if (netif_xmit_stopped(txq) && skb) {
2903                         rc = NETDEV_TX_BUSY;
2904                         break;
2905                 }
2906         }
2907
2908 out:
2909         *ret = rc;
2910         return skb;
2911 }
2912
2913 static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
2914                                           netdev_features_t features)
2915 {
2916         if (skb_vlan_tag_present(skb) &&
2917             !vlan_hw_offload_capable(features, skb->vlan_proto))
2918                 skb = __vlan_hwaccel_push_inside(skb);
2919         return skb;
2920 }
2921
2922 static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
2923 {
2924         netdev_features_t features;
2925
2926         features = netif_skb_features(skb);
2927         skb = validate_xmit_vlan(skb, features);
2928         if (unlikely(!skb))
2929                 goto out_null;
2930
2931         if (netif_needs_gso(skb, features)) {
2932                 struct sk_buff *segs;
2933
2934                 segs = skb_gso_segment(skb, features);
2935                 if (IS_ERR(segs)) {
2936                         goto out_kfree_skb;
2937                 } else if (segs) {
2938                         consume_skb(skb);
2939                         skb = segs;
2940                 }
2941         } else {
2942                 if (skb_needs_linearize(skb, features) &&
2943                     __skb_linearize(skb))
2944                         goto out_kfree_skb;
2945
2946                 /* If packet is not checksummed and device does not
2947                  * support checksumming for this protocol, complete
2948                  * checksumming here.
2949                  */
2950                 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2951                         if (skb->encapsulation)
2952                                 skb_set_inner_transport_header(skb,
2953                                                                skb_checksum_start_offset(skb));
2954                         else
2955                                 skb_set_transport_header(skb,
2956                                                          skb_checksum_start_offset(skb));
2957                         if (!(features & NETIF_F_CSUM_MASK) &&
2958                             skb_checksum_help(skb))
2959                                 goto out_kfree_skb;
2960                 }
2961         }
2962
2963         return skb;
2964
2965 out_kfree_skb:
2966         kfree_skb(skb);
2967 out_null:
2968         atomic_long_inc(&dev->tx_dropped);
2969         return NULL;
2970 }
2971
2972 struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev)
2973 {
2974         struct sk_buff *next, *head = NULL, *tail;
2975
2976         for (; skb != NULL; skb = next) {
2977                 next = skb->next;
2978                 skb->next = NULL;
2979
2980                 /* in case skb wont be segmented, point to itself */
2981                 skb->prev = skb;
2982
2983                 skb = validate_xmit_skb(skb, dev);
2984                 if (!skb)
2985                         continue;
2986
2987                 if (!head)
2988                         head = skb;
2989                 else
2990                         tail->next = skb;
2991                 /* If skb was segmented, skb->prev points to
2992                  * the last segment. If not, it still contains skb.
2993                  */
2994                 tail = skb->prev;
2995         }
2996         return head;
2997 }
2998 EXPORT_SYMBOL_GPL(validate_xmit_skb_list);
2999
3000 static void qdisc_pkt_len_init(struct sk_buff *skb)
3001 {
3002         const struct skb_shared_info *shinfo = skb_shinfo(skb);
3003
3004         qdisc_skb_cb(skb)->pkt_len = skb->len;
3005
3006         /* To get more precise estimation of bytes sent on wire,
3007          * we add to pkt_len the headers size of all segments
3008          */
3009         if (shinfo->gso_size)  {
3010                 unsigned int hdr_len;
3011                 u16 gso_segs = shinfo->gso_segs;
3012
3013                 /* mac layer + network layer */
3014                 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
3015
3016                 /* + transport layer */
3017                 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
3018                         hdr_len += tcp_hdrlen(skb);
3019                 else
3020                         hdr_len += sizeof(struct udphdr);
3021
3022                 if (shinfo->gso_type & SKB_GSO_DODGY)
3023                         gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
3024                                                 shinfo->gso_size);
3025
3026                 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
3027         }
3028 }
3029
3030 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
3031                                  struct net_device *dev,
3032                                  struct netdev_queue *txq)
3033 {
3034         spinlock_t *root_lock = qdisc_lock(q);
3035         struct sk_buff *to_free = NULL;
3036         bool contended;
3037         int rc;
3038
3039         qdisc_calculate_pkt_len(skb, q);
3040         /*
3041          * Heuristic to force contended enqueues to serialize on a
3042          * separate lock before trying to get qdisc main lock.
3043          * This permits qdisc->running owner to get the lock more
3044          * often and dequeue packets faster.
3045          */
3046         contended = qdisc_is_running(q);
3047         if (unlikely(contended))
3048                 spin_lock(&q->busylock);
3049
3050         spin_lock(root_lock);
3051         if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
3052                 __qdisc_drop(skb, &to_free);
3053                 rc = NET_XMIT_DROP;
3054         } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
3055                    qdisc_run_begin(q)) {
3056                 /*
3057                  * This is a work-conserving queue; there are no old skbs
3058                  * waiting to be sent out; and the qdisc is not running -
3059                  * xmit the skb directly.
3060                  */
3061
3062                 qdisc_bstats_update(q, skb);
3063
3064                 if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
3065                         if (unlikely(contended)) {
3066                                 spin_unlock(&q->busylock);
3067                                 contended = false;
3068                         }
3069                         __qdisc_run(q);
3070                 } else
3071                         qdisc_run_end(q);
3072
3073                 rc = NET_XMIT_SUCCESS;
3074         } else {
3075                 rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
3076                 if (qdisc_run_begin(q)) {
3077                         if (unlikely(contended)) {
3078                                 spin_unlock(&q->busylock);
3079                                 contended = false;
3080                         }
3081                         __qdisc_run(q);
3082                 }
3083         }
3084         spin_unlock(root_lock);
3085         if (unlikely(to_free))
3086                 kfree_skb_list(to_free);
3087         if (unlikely(contended))
3088                 spin_unlock(&q->busylock);
3089         return rc;
3090 }
3091
3092 #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
3093 static void skb_update_prio(struct sk_buff *skb)
3094 {
3095         struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
3096
3097         if (!skb->priority && skb->sk && map) {
3098                 unsigned int prioidx =
3099                         sock_cgroup_prioidx(&skb->sk->sk_cgrp_data);
3100
3101                 if (prioidx < map->priomap_len)
3102                         skb->priority = map->priomap[prioidx];
3103         }
3104 }
3105 #else
3106 #define skb_update_prio(skb)
3107 #endif
3108
3109 DEFINE_PER_CPU(int, xmit_recursion);
3110 EXPORT_SYMBOL(xmit_recursion);
3111
3112 /**
3113  *      dev_loopback_xmit - loop back @skb
3114  *      @net: network namespace this loopback is happening in
3115  *      @sk:  sk needed to be a netfilter okfn
3116  *      @skb: buffer to transmit
3117  */
3118 int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
3119 {
3120         skb_reset_mac_header(skb);
3121         __skb_pull(skb, skb_network_offset(skb));
3122         skb->pkt_type = PACKET_LOOPBACK;
3123         skb->ip_summed = CHECKSUM_UNNECESSARY;
3124         WARN_ON(!skb_dst(skb));
3125         skb_dst_force(skb);
3126         netif_rx_ni(skb);
3127         return 0;
3128 }
3129 EXPORT_SYMBOL(dev_loopback_xmit);
3130
3131 #ifdef CONFIG_NET_EGRESS
3132 static struct sk_buff *
3133 sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
3134 {
3135         struct tcf_proto *cl = rcu_dereference_bh(dev->egress_cl_list);
3136         struct tcf_result cl_res;
3137
3138         if (!cl)
3139                 return skb;
3140
3141         /* qdisc_skb_cb(skb)->pkt_len was already set by the caller. */
3142         qdisc_bstats_cpu_update(cl->q, skb);
3143
3144         switch (tc_classify(skb, cl, &cl_res, false)) {
3145         case TC_ACT_OK:
3146         case TC_ACT_RECLASSIFY:
3147                 skb->tc_index = TC_H_MIN(cl_res.classid);
3148                 break;
3149         case TC_ACT_SHOT:
3150                 qdisc_qstats_cpu_drop(cl->q);
3151                 *ret = NET_XMIT_DROP;
3152                 kfree_skb(skb);
3153                 return NULL;
3154         case TC_ACT_STOLEN:
3155         case TC_ACT_QUEUED:
3156                 *ret = NET_XMIT_SUCCESS;
3157                 consume_skb(skb);
3158                 return NULL;
3159         case TC_ACT_REDIRECT:
3160                 /* No need to push/pop skb's mac_header here on egress! */
3161                 skb_do_redirect(skb);
3162                 *ret = NET_XMIT_SUCCESS;
3163                 return NULL;
3164         default:
3165                 break;
3166         }
3167
3168         return skb;
3169 }
3170 #endif /* CONFIG_NET_EGRESS */
3171
3172 static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
3173 {
3174 #ifdef CONFIG_XPS
3175         struct xps_dev_maps *dev_maps;
3176         struct xps_map *map;
3177         int queue_index = -1;
3178
3179         rcu_read_lock();
3180         dev_maps = rcu_dereference(dev->xps_maps);
3181         if (dev_maps) {
3182                 unsigned int tci = skb->sender_cpu - 1;
3183
3184                 if (dev->num_tc) {
3185                         tci *= dev->num_tc;
3186                         tci += netdev_get_prio_tc_map(dev, skb->priority);
3187                 }
3188
3189                 map = rcu_dereference(dev_maps->cpu_map[tci]);
3190                 if (map) {
3191                         if (map->len == 1)
3192                                 queue_index = map->queues[0];
3193                         else
3194                                 queue_index = map->queues[reciprocal_scale(skb_get_hash(skb),
3195                                                                            map->len)];
3196                         if (unlikely(queue_index >= dev->real_num_tx_queues))
3197                                 queue_index = -1;
3198                 }
3199         }
3200         rcu_read_unlock();
3201
3202         return queue_index;
3203 #else
3204         return -1;
3205 #endif
3206 }
3207
3208 static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
3209 {
3210         struct sock *sk = skb->sk;
3211         int queue_index = sk_tx_queue_get(sk);
3212
3213         if (queue_index < 0 || skb->ooo_okay ||
3214             queue_index >= dev->real_num_tx_queues) {
3215                 int new_index = get_xps_queue(dev, skb);
3216                 if (new_index < 0)
3217                         new_index = skb_tx_hash(dev, skb);
3218
3219                 if (queue_index != new_index && sk &&
3220                     sk_fullsock(sk) &&
3221                     rcu_access_pointer(sk->sk_dst_cache))
3222                         sk_tx_queue_set(sk, new_index);
3223
3224                 queue_index = new_index;
3225         }
3226
3227         return queue_index;
3228 }
3229
3230 struct netdev_queue *netdev_pick_tx(struct net_device *dev,
3231                                     struct sk_buff *skb,
3232                                     void *accel_priv)
3233 {
3234         int queue_index = 0;
3235
3236 #ifdef CONFIG_XPS
3237         u32 sender_cpu = skb->sender_cpu - 1;
3238
3239         if (sender_cpu >= (u32)NR_CPUS)
3240                 skb->sender_cpu = raw_smp_processor_id() + 1;
3241 #endif
3242
3243         if (dev->real_num_tx_queues != 1) {
3244                 const struct net_device_ops *ops = dev->netdev_ops;
3245                 if (ops->ndo_select_queue)
3246                         queue_index = ops->ndo_select_queue(dev, skb, accel_priv,
3247                                                             __netdev_pick_tx);
3248                 else
3249                         queue_index = __netdev_pick_tx(dev, skb);
3250
3251                 if (!accel_priv)
3252                         queue_index = netdev_cap_txqueue(dev, queue_index);
3253         }
3254
3255         skb_set_queue_mapping(skb, queue_index);
3256         return netdev_get_tx_queue(dev, queue_index);
3257 }
3258
3259 /**
3260  *      __dev_queue_xmit - transmit a buffer
3261  *      @skb: buffer to transmit
3262  *      @accel_priv: private data used for L2 forwarding offload
3263  *
3264  *      Queue a buffer for transmission to a network device. The caller must
3265  *      have set the device and priority and built the buffer before calling
3266  *      this function. The function can be called from an interrupt.
3267  *
3268  *      A negative errno code is returned on a failure. A success does not
3269  *      guarantee the frame will be transmitted as it may be dropped due
3270  *      to congestion or traffic shaping.
3271  *
3272  * -----------------------------------------------------------------------------------
3273  *      I notice this method can also return errors from the queue disciplines,
3274  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
3275  *      be positive.
3276  *
3277  *      Regardless of the return value, the skb is consumed, so it is currently
3278  *      difficult to retry a send to this method.  (You can bump the ref count
3279  *      before sending to hold a reference for retry if you are careful.)
3280  *
3281  *      When calling this method, interrupts MUST be enabled.  This is because
3282  *      the BH enable code must have IRQs enabled so that it will not deadlock.
3283  *          --BLG
3284  */
3285 static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
3286 {
3287         struct net_device *dev = skb->dev;
3288         struct netdev_queue *txq;
3289         struct Qdisc *q;
3290         int rc = -ENOMEM;
3291
3292         skb_reset_mac_header(skb);
3293
3294         if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
3295                 __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
3296
3297         /* Disable soft irqs for various locks below. Also
3298          * stops preemption for RCU.
3299          */
3300         rcu_read_lock_bh();
3301
3302         skb_update_prio(skb);
3303
3304         qdisc_pkt_len_init(skb);
3305 #ifdef CONFIG_NET_CLS_ACT
3306         skb->tc_at_ingress = 0;
3307 # ifdef CONFIG_NET_EGRESS
3308         if (static_key_false(&egress_needed)) {
3309                 skb = sch_handle_egress(skb, &rc, dev);
3310                 if (!skb)
3311                         goto out;
3312         }
3313 # endif
3314 #endif
3315         /* If device/qdisc don't need skb->dst, release it right now while
3316          * its hot in this cpu cache.
3317          */
3318         if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
3319                 skb_dst_drop(skb);
3320         else
3321                 skb_dst_force(skb);
3322
3323         txq = netdev_pick_tx(dev, skb, accel_priv);
3324         q = rcu_dereference_bh(txq->qdisc);
3325
3326         trace_net_dev_queue(skb);
3327         if (q->enqueue) {
3328                 rc = __dev_xmit_skb(skb, q, dev, txq);
3329                 goto out;
3330         }
3331
3332         /* The device has no queue. Common case for software devices:
3333            loopback, all the sorts of tunnels...
3334
3335            Really, it is unlikely that netif_tx_lock protection is necessary
3336            here.  (f.e. loopback and IP tunnels are clean ignoring statistics
3337            counters.)
3338            However, it is possible, that they rely on protection
3339            made by us here.
3340
3341            Check this and shot the lock. It is not prone from deadlocks.
3342            Either shot noqueue qdisc, it is even simpler 8)
3343          */
3344         if (dev->flags & IFF_UP) {
3345                 int cpu = smp_processor_id(); /* ok because BHs are off */
3346
3347                 if (txq->xmit_lock_owner != cpu) {
3348                         if (unlikely(__this_cpu_read(xmit_recursion) >
3349                                      XMIT_RECURSION_LIMIT))
3350                                 goto recursion_alert;
3351
3352                         skb = validate_xmit_skb(skb, dev);
3353                         if (!skb)
3354                                 goto out;
3355
3356                         HARD_TX_LOCK(dev, txq, cpu);
3357
3358                         if (!netif_xmit_stopped(txq)) {
3359                                 __this_cpu_inc(xmit_recursion);
3360                                 skb = dev_hard_start_xmit(skb, dev, txq, &rc);
3361                                 __this_cpu_dec(xmit_recursion);
3362                                 if (dev_xmit_complete(rc)) {
3363                                         HARD_TX_UNLOCK(dev, txq);
3364                                         goto out;
3365                                 }
3366                         }
3367                         HARD_TX_UNLOCK(dev, txq);
3368                         net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
3369                                              dev->name);
3370                 } else {
3371                         /* Recursion is detected! It is possible,
3372                          * unfortunately
3373                          */
3374 recursion_alert:
3375                         net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
3376                                              dev->name);
3377                 }
3378         }
3379
3380         rc = -ENETDOWN;
3381         rcu_read_unlock_bh();
3382
3383         atomic_long_inc(&dev->tx_dropped);
3384         kfree_skb_list(skb);
3385         return rc;
3386 out:
3387         rcu_read_unlock_bh();
3388         return rc;
3389 }
3390
3391 int dev_queue_xmit(struct sk_buff *skb)
3392 {
3393         return __dev_queue_xmit(skb, NULL);
3394 }
3395 EXPORT_SYMBOL(dev_queue_xmit);
3396
3397 int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
3398 {
3399         return __dev_queue_xmit(skb, accel_priv);
3400 }
3401 EXPORT_SYMBOL(dev_queue_xmit_accel);
3402
3403
3404 /*=======================================================================
3405                         Receiver routines
3406   =======================================================================*/
3407
3408 int netdev_max_backlog __read_mostly = 1000;
3409 EXPORT_SYMBOL(netdev_max_backlog);
3410
3411 int netdev_tstamp_prequeue __read_mostly = 1;
3412 int netdev_budget __read_mostly = 300;
3413 int weight_p __read_mostly = 64;           /* old backlog weight */
3414 int dev_weight_rx_bias __read_mostly = 1;  /* bias for backlog weight */
3415 int dev_weight_tx_bias __read_mostly = 1;  /* bias for output_queue quota */
3416 int dev_rx_weight __read_mostly = 64;
3417 int dev_tx_weight __read_mostly = 64;
3418
3419 /* Called with irq disabled */
3420 static inline void ____napi_schedule(struct softnet_data *sd,
3421                                      struct napi_struct *napi)
3422 {
3423         list_add_tail(&napi->poll_list, &sd->poll_list);
3424         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3425 }
3426
3427 #ifdef CONFIG_RPS
3428
3429 /* One global table that all flow-based protocols share. */
3430 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
3431 EXPORT_SYMBOL(rps_sock_flow_table);
3432 u32 rps_cpu_mask __read_mostly;
3433 EXPORT_SYMBOL(rps_cpu_mask);
3434
3435 struct static_key rps_needed __read_mostly;
3436 EXPORT_SYMBOL(rps_needed);
3437 struct static_key rfs_needed __read_mostly;
3438 EXPORT_SYMBOL(rfs_needed);
3439
3440 static struct rps_dev_flow *
3441 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3442             struct rps_dev_flow *rflow, u16 next_cpu)
3443 {
3444         if (next_cpu < nr_cpu_ids) {
3445 #ifdef CONFIG_RFS_ACCEL
3446                 struct netdev_rx_queue *rxqueue;
3447                 struct rps_dev_flow_table *flow_table;
3448                 struct rps_dev_flow *old_rflow;
3449                 u32 flow_id;
3450                 u16 rxq_index;
3451                 int rc;
3452
3453                 /* Should we steer this flow to a different hardware queue? */
3454                 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
3455                     !(dev->features & NETIF_F_NTUPLE))
3456                         goto out;
3457                 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
3458                 if (rxq_index == skb_get_rx_queue(skb))
3459                         goto out;
3460
3461                 rxqueue = dev->_rx + rxq_index;
3462                 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3463                 if (!flow_table)
3464                         goto out;
3465                 flow_id = skb_get_hash(skb) & flow_table->mask;
3466                 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
3467                                                         rxq_index, flow_id);
3468                 if (rc < 0)
3469                         goto out;
3470                 old_rflow = rflow;
3471                 rflow = &flow_table->flows[flow_id];
3472                 rflow->filter = rc;
3473                 if (old_rflow->filter == rflow->filter)
3474                         old_rflow->filter = RPS_NO_FILTER;
3475         out:
3476 #endif
3477                 rflow->last_qtail =
3478                         per_cpu(softnet_data, next_cpu).input_queue_head;
3479         }
3480
3481         rflow->cpu = next_cpu;
3482         return rflow;
3483 }
3484
3485 /*
3486  * get_rps_cpu is called from netif_receive_skb and returns the target
3487  * CPU from the RPS map of the receiving queue for a given skb.
3488  * rcu_read_lock must be held on entry.
3489  */
3490 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3491                        struct rps_dev_flow **rflowp)
3492 {
3493         const struct rps_sock_flow_table *sock_flow_table;
3494         struct netdev_rx_queue *rxqueue = dev->_rx;
3495         struct rps_dev_flow_table *flow_table;
3496         struct rps_map *map;
3497         int cpu = -1;
3498         u32 tcpu;
3499         u32 hash;
3500
3501         if (skb_rx_queue_recorded(skb)) {
3502                 u16 index = skb_get_rx_queue(skb);
3503
3504                 if (unlikely(index >= dev->real_num_rx_queues)) {
3505                         WARN_ONCE(dev->real_num_rx_queues > 1,
3506                                   "%s received packet on queue %u, but number "
3507                                   "of RX queues is %u\n",
3508                                   dev->name, index, dev->real_num_rx_queues);
3509                         goto done;
3510                 }
3511                 rxqueue += index;
3512         }
3513
3514         /* Avoid computing hash if RFS/RPS is not active for this rxqueue */
3515
3516         flow_table = rcu_dereference(rxqueue->rps_flow_table);
3517         map = rcu_dereference(rxqueue->rps_map);
3518         if (!flow_table && !map)
3519                 goto done;
3520
3521         skb_reset_network_header(skb);
3522         hash = skb_get_hash(skb);
3523         if (!hash)
3524                 goto done;
3525
3526         sock_flow_table = rcu_dereference(rps_sock_flow_table);
3527         if (flow_table && sock_flow_table) {
3528                 struct rps_dev_flow *rflow;
3529                 u32 next_cpu;
3530                 u32 ident;
3531
3532                 /* First check into global flow table if there is a match */
3533                 ident = sock_flow_table->ents[hash & sock_flow_table->mask];
3534                 if ((ident ^ hash) & ~rps_cpu_mask)
3535                         goto try_rps;
3536
3537                 next_cpu = ident & rps_cpu_mask;
3538
3539                 /* OK, now we know there is a match,
3540                  * we can look at the local (per receive queue) flow table
3541                  */
3542                 rflow = &flow_table->flows[hash & flow_table->mask];
3543                 tcpu = rflow->cpu;
3544
3545                 /*
3546                  * If the desired CPU (where last recvmsg was done) is
3547                  * different from current CPU (one in the rx-queue flow
3548                  * table entry), switch if one of the following holds:
3549                  *   - Current CPU is unset (>= nr_cpu_ids).
3550                  *   - Current CPU is offline.
3551                  *   - The current CPU's queue tail has advanced beyond the
3552                  *     last packet that was enqueued using this table entry.
3553                  *     This guarantees that all previous packets for the flow
3554                  *     have been dequeued, thus preserving in order delivery.
3555                  */
3556                 if (unlikely(tcpu != next_cpu) &&
3557                     (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
3558                      ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3559                       rflow->last_qtail)) >= 0)) {
3560                         tcpu = next_cpu;
3561                         rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3562                 }
3563
3564                 if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
3565                         *rflowp = rflow;
3566                         cpu = tcpu;
3567                         goto done;
3568                 }
3569         }
3570
3571 try_rps:
3572
3573         if (map) {
3574                 tcpu = map->cpus[reciprocal_scale(hash, map->len)];
3575                 if (cpu_online(tcpu)) {
3576                         cpu = tcpu;
3577                         goto done;
3578                 }
3579         }
3580
3581 done:
3582         return cpu;
3583 }
3584
3585 #ifdef CONFIG_RFS_ACCEL
3586
3587 /**
3588  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3589  * @dev: Device on which the filter was set
3590  * @rxq_index: RX queue index
3591  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3592  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3593  *
3594  * Drivers that implement ndo_rx_flow_steer() should periodically call
3595  * this function for each installed filter and remove the filters for
3596  * which it returns %true.
3597  */
3598 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3599                          u32 flow_id, u16 filter_id)
3600 {
3601         struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3602         struct rps_dev_flow_table *flow_table;
3603         struct rps_dev_flow *rflow;
3604         bool expire = true;
3605         unsigned int cpu;
3606
3607         rcu_read_lock();
3608         flow_table = rcu_dereference(rxqueue->rps_flow_table);
3609         if (flow_table && flow_id <= flow_table->mask) {
3610                 rflow = &flow_table->flows[flow_id];
3611                 cpu = ACCESS_ONCE(rflow->cpu);
3612                 if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
3613                     ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3614                            rflow->last_qtail) <
3615                      (int)(10 * flow_table->mask)))
3616                         expire = false;
3617         }
3618         rcu_read_unlock();
3619         return expire;
3620 }
3621 EXPORT_SYMBOL(rps_may_expire_flow);
3622
3623 #endif /* CONFIG_RFS_ACCEL */
3624
3625 /* Called from hardirq (IPI) context */
3626 static void rps_trigger_softirq(void *data)
3627 {
3628         struct softnet_data *sd = data;
3629
3630         ____napi_schedule(sd, &sd->backlog);
3631         sd->received_rps++;
3632 }
3633
3634 #endif /* CONFIG_RPS */
3635
3636 /*
3637  * Check if this softnet_data structure is another cpu one
3638  * If yes, queue it to our IPI list and return 1
3639  * If no, return 0
3640  */
3641 static int rps_ipi_queued(struct softnet_data *sd)
3642 {
3643 #ifdef CONFIG_RPS
3644         struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
3645
3646         if (sd != mysd) {
3647                 sd->rps_ipi_next = mysd->rps_ipi_list;
3648                 mysd->rps_ipi_list = sd;
3649
3650                 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3651                 return 1;
3652         }
3653 #endif /* CONFIG_RPS */
3654         return 0;
3655 }
3656
3657 #ifdef CONFIG_NET_FLOW_LIMIT
3658 int netdev_flow_limit_table_len __read_mostly = (1 << 12);
3659 #endif
3660
3661 static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3662 {
3663 #ifdef CONFIG_NET_FLOW_LIMIT
3664         struct sd_flow_limit *fl;
3665         struct softnet_data *sd;
3666         unsigned int old_flow, new_flow;
3667
3668         if (qlen < (netdev_max_backlog >> 1))
3669                 return false;
3670
3671         sd = this_cpu_ptr(&softnet_data);
3672
3673         rcu_read_lock();
3674         fl = rcu_dereference(sd->flow_limit);
3675         if (fl) {
3676                 new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
3677                 old_flow = fl->history[fl->history_head];
3678                 fl->history[fl->history_head] = new_flow;
3679
3680                 fl->history_head++;
3681                 fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3682
3683                 if (likely(fl->buckets[old_flow]))
3684                         fl->buckets[old_flow]--;
3685
3686                 if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3687                         fl->count++;
3688                         rcu_read_unlock();
3689                         return true;
3690                 }
3691         }
3692         rcu_read_unlock();
3693 #endif
3694         return false;
3695 }
3696
3697 /*
3698  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3699  * queue (may be a remote CPU queue).
3700  */
3701 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3702                               unsigned int *qtail)
3703 {
3704         struct softnet_data *sd;
3705         unsigned long flags;
3706         unsigned int qlen;
3707
3708         sd = &per_cpu(softnet_data, cpu);
3709
3710         local_irq_save(flags);
3711
3712         rps_lock(sd);
3713         if (!netif_running(skb->dev))
3714                 goto drop;
3715         qlen = skb_queue_len(&sd->input_pkt_queue);
3716         if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
3717                 if (qlen) {
3718 enqueue:
3719                         __skb_queue_tail(&sd->input_pkt_queue, skb);
3720                         input_queue_tail_incr_save(sd, qtail);
3721                         rps_unlock(sd);
3722                         local_irq_restore(flags);
3723                         return NET_RX_SUCCESS;
3724                 }
3725
3726                 /* Schedule NAPI for backlog device
3727                  * We can use non atomic operation since we own the queue lock
3728                  */
3729                 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3730                         if (!rps_ipi_queued(sd))
3731                                 ____napi_schedule(sd, &sd->backlog);
3732                 }
3733                 goto enqueue;
3734         }
3735
3736 drop:
3737         sd->dropped++;
3738         rps_unlock(sd);
3739
3740         local_irq_restore(flags);
3741
3742         atomic_long_inc(&skb->dev->rx_dropped);
3743         kfree_skb(skb);
3744         return NET_RX_DROP;
3745 }
3746
3747 static int netif_rx_internal(struct sk_buff *skb)
3748 {
3749         int ret;
3750
3751         net_timestamp_check(netdev_tstamp_prequeue, skb);
3752
3753         trace_netif_rx(skb);
3754 #ifdef CONFIG_RPS
3755         if (static_key_false(&rps_needed)) {
3756                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3757                 int cpu;
3758
3759                 preempt_disable();
3760                 rcu_read_lock();
3761
3762                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3763                 if (cpu < 0)
3764                         cpu = smp_processor_id();
3765
3766                 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3767
3768                 rcu_read_unlock();
3769                 preempt_enable();
3770         } else
3771 #endif
3772         {
3773                 unsigned int qtail;
3774                 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3775                 put_cpu();
3776         }
3777         return ret;
3778 }
3779
3780 /**
3781  *      netif_rx        -       post buffer to the network code
3782  *      @skb: buffer to post
3783  *
3784  *      This function receives a packet from a device driver and queues it for
3785  *      the upper (protocol) levels to process.  It always succeeds. The buffer
3786  *      may be dropped during processing for congestion control or by the
3787  *      protocol layers.
3788  *
3789  *      return values:
3790  *      NET_RX_SUCCESS  (no congestion)
3791  *      NET_RX_DROP     (packet was dropped)
3792  *
3793  */
3794
3795 int netif_rx(struct sk_buff *skb)
3796 {
3797         trace_netif_rx_entry(skb);
3798
3799         return netif_rx_internal(skb);
3800 }
3801 EXPORT_SYMBOL(netif_rx);
3802
3803 int netif_rx_ni(struct sk_buff *skb)
3804 {
3805         int err;
3806
3807         trace_netif_rx_ni_entry(skb);
3808
3809         preempt_disable();
3810         err = netif_rx_internal(skb);
3811         if (local_softirq_pending())
3812                 do_softirq();
3813         preempt_enable();
3814
3815         return err;
3816 }
3817 EXPORT_SYMBOL(netif_rx_ni);
3818
3819 static __latent_entropy void net_tx_action(struct softirq_action *h)
3820 {
3821         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
3822
3823         if (sd->completion_queue) {
3824                 struct sk_buff *clist;
3825
3826                 local_irq_disable();
3827                 clist = sd->completion_queue;
3828                 sd->completion_queue = NULL;
3829                 local_irq_enable();
3830
3831                 while (clist) {
3832                         struct sk_buff *skb = clist;
3833                         clist = clist->next;
3834
3835                         WARN_ON(atomic_read(&skb->users));
3836                         if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
3837                                 trace_consume_skb(skb);
3838                         else
3839                                 trace_kfree_skb(skb, net_tx_action);
3840
3841                         if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
3842                                 __kfree_skb(skb);
3843                         else
3844                                 __kfree_skb_defer(skb);
3845                 }
3846
3847                 __kfree_skb_flush();
3848         }
3849
3850         if (sd->output_queue) {
3851                 struct Qdisc *head;
3852
3853                 local_irq_disable();
3854                 head = sd->output_queue;
3855                 sd->output_queue = NULL;
3856                 sd->output_queue_tailp = &sd->output_queue;
3857                 local_irq_enable();
3858
3859                 while (head) {
3860                         struct Qdisc *q = head;
3861                         spinlock_t *root_lock;
3862
3863                         head = head->next_sched;
3864
3865                         root_lock = qdisc_lock(q);
3866                         spin_lock(root_lock);
3867                         /* We need to make sure head->next_sched is read
3868                          * before clearing __QDISC_STATE_SCHED
3869                          */
3870                         smp_mb__before_atomic();
3871                         clear_bit(__QDISC_STATE_SCHED, &q->state);
3872                         qdisc_run(q);
3873                         spin_unlock(root_lock);
3874                 }
3875         }
3876 }
3877
3878 #if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE)
3879 /* This hook is defined here for ATM LANE */
3880 int (*br_fdb_test_addr_hook)(struct net_device *dev,
3881                              unsigned char *addr) __read_mostly;
3882 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3883 #endif
3884
3885 static inline struct sk_buff *
3886 sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
3887                    struct net_device *orig_dev)
3888 {
3889 #ifdef CONFIG_NET_CLS_ACT
3890         struct tcf_proto *cl = rcu_dereference_bh(skb->dev->ingress_cl_list);
3891         struct tcf_result cl_res;
3892
3893         /* If there's at least one ingress present somewhere (so
3894          * we get here via enabled static key), remaining devices
3895          * that are not configured with an ingress qdisc will bail
3896          * out here.
3897          */
3898         if (!cl)
3899                 return skb;
3900         if (*pt_prev) {
3901                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3902                 *pt_prev = NULL;
3903         }
3904
3905         qdisc_skb_cb(skb)->pkt_len = skb->len;
3906         skb->tc_at_ingress = 1;
3907         qdisc_bstats_cpu_update(cl->q, skb);
3908
3909         switch (tc_classify(skb, cl, &cl_res, false)) {
3910         case TC_ACT_OK:
3911         case TC_ACT_RECLASSIFY:
3912                 skb->tc_index = TC_H_MIN(cl_res.classid);
3913                 break;
3914         case TC_ACT_SHOT:
3915                 qdisc_qstats_cpu_drop(cl->q);
3916                 kfree_skb(skb);
3917                 return NULL;
3918         case TC_ACT_STOLEN:
3919         case TC_ACT_QUEUED:
3920                 consume_skb(skb);
3921                 return NULL;
3922         case TC_ACT_REDIRECT:
3923                 /* skb_mac_header check was done by cls/act_bpf, so
3924                  * we can safely push the L2 header back before
3925                  * redirecting to another netdev
3926                  */
3927                 __skb_push(skb, skb->mac_len);
3928                 skb_do_redirect(skb);
3929                 return NULL;
3930         default:
3931                 break;
3932         }
3933 #endif /* CONFIG_NET_CLS_ACT */
3934         return skb;
3935 }
3936
3937 /**
3938  *      netdev_is_rx_handler_busy - check if receive handler is registered
3939  *      @dev: device to check
3940  *
3941  *      Check if a receive handler is already registered for a given device.
3942  *      Return true if there one.
3943  *
3944  *      The caller must hold the rtnl_mutex.
3945  */
3946 bool netdev_is_rx_handler_busy(struct net_device *dev)
3947 {
3948         ASSERT_RTNL();
3949         return dev && rtnl_dereference(dev->rx_handler);
3950 }
3951 EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy);
3952
3953 /**
3954  *      netdev_rx_handler_register - register receive handler
3955  *      @dev: device to register a handler for
3956  *      @rx_handler: receive handler to register
3957  *      @rx_handler_data: data pointer that is used by rx handler
3958  *
3959  *      Register a receive handler for a device. This handler will then be
3960  *      called from __netif_receive_skb. A negative errno code is returned
3961  *      on a failure.
3962  *
3963  *      The caller must hold the rtnl_mutex.
3964  *
3965  *      For a general description of rx_handler, see enum rx_handler_result.
3966  */
3967 int netdev_rx_handler_register(struct net_device *dev,
3968                                rx_handler_func_t *rx_handler,
3969                                void *rx_handler_data)
3970 {
3971         if (netdev_is_rx_handler_busy(dev))
3972                 return -EBUSY;
3973
3974         /* Note: rx_handler_data must be set before rx_handler */
3975         rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3976         rcu_assign_pointer(dev->rx_handler, rx_handler);
3977
3978         return 0;
3979 }
3980 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3981
3982 /**
3983  *      netdev_rx_handler_unregister - unregister receive handler
3984  *      @dev: device to unregister a handler from
3985  *
3986  *      Unregister a receive handler from a device.
3987  *
3988  *      The caller must hold the rtnl_mutex.
3989  */
3990 void netdev_rx_handler_unregister(struct net_device *dev)
3991 {
3992
3993         ASSERT_RTNL();
3994         RCU_INIT_POINTER(dev->rx_handler, NULL);
3995         /* a reader seeing a non NULL rx_handler in a rcu_read_lock()
3996          * section has a guarantee to see a non NULL rx_handler_data
3997          * as well.
3998          */
3999         synchronize_net();
4000         RCU_INIT_POINTER(dev->rx_handler_data, NULL);
4001 }
4002 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
4003
4004 /*
4005  * Limit the use of PFMEMALLOC reserves to those protocols that implement
4006  * the special handling of PFMEMALLOC skbs.
4007  */
4008 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
4009 {
4010         switch (skb->protocol) {
4011         case htons(ETH_P_ARP):
4012         case htons(ETH_P_IP):
4013         case htons(ETH_P_IPV6):
4014         case htons(ETH_P_8021Q):
4015         case htons(ETH_P_8021AD):
4016                 return true;
4017         default:
4018                 return false;
4019         }
4020 }
4021
4022 static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
4023                              int *ret, struct net_device *orig_dev)
4024 {
4025 #ifdef CONFIG_NETFILTER_INGRESS
4026         if (nf_hook_ingress_active(skb)) {
4027                 int ingress_retval;
4028
4029                 if (*pt_prev) {
4030                         *ret = deliver_skb(skb, *pt_prev, orig_dev);
4031                         *pt_prev = NULL;
4032                 }
4033
4034                 rcu_read_lock();
4035                 ingress_retval = nf_hook_ingress(skb);
4036                 rcu_read_unlock();
4037                 return ingress_retval;
4038         }
4039 #endif /* CONFIG_NETFILTER_INGRESS */
4040         return 0;
4041 }
4042
4043 static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
4044 {
4045         struct packet_type *ptype, *pt_prev;
4046         rx_handler_func_t *rx_handler;
4047         struct net_device *orig_dev;
4048         bool deliver_exact = false;
4049         int ret = NET_RX_DROP;
4050         __be16 type;
4051
4052         net_timestamp_check(!netdev_tstamp_prequeue, skb);
4053
4054         trace_netif_receive_skb(skb);
4055
4056         orig_dev = skb->dev;
4057
4058         skb_reset_network_header(skb);
4059         if (!skb_transport_header_was_set(skb))
4060                 skb_reset_transport_header(skb);
4061         skb_reset_mac_len(skb);
4062
4063         pt_prev = NULL;
4064
4065 another_round:
4066         skb->skb_iif = skb->dev->ifindex;
4067
4068         __this_cpu_inc(softnet_data.processed);
4069
4070         if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
4071             skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
4072                 skb = skb_vlan_untag(skb);
4073                 if (unlikely(!skb))
4074                         goto out;
4075         }
4076
4077         if (skb_skip_tc_classify(skb))
4078                 goto skip_classify;
4079
4080         if (pfmemalloc)
4081                 goto skip_taps;
4082
4083         list_for_each_entry_rcu(ptype, &ptype_all, list) {
4084                 if (pt_prev)
4085                         ret = deliver_skb(skb, pt_prev, orig_dev);
4086                 pt_prev = ptype;
4087         }
4088
4089         list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
4090                 if (pt_prev)
4091                         ret = deliver_skb(skb, pt_prev, orig_dev);
4092                 pt_prev = ptype;
4093         }
4094
4095 skip_taps:
4096 #ifdef CONFIG_NET_INGRESS
4097         if (static_key_false(&ingress_needed)) {
4098                 skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev);
4099                 if (!skb)
4100                         goto out;
4101
4102                 if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
4103                         goto out;
4104         }
4105 #endif
4106         skb_reset_tc(skb);
4107 skip_classify:
4108         if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
4109                 goto drop;
4110
4111         if (skb_vlan_tag_present(skb)) {
4112                 if (pt_prev) {
4113                         ret = deliver_skb(skb, pt_prev, orig_dev);
4114                         pt_prev = NULL;
4115                 }
4116                 if (vlan_do_receive(&skb))
4117                         goto another_round;
4118                 else if (unlikely(!skb))
4119                         goto out;
4120         }
4121
4122         rx_handler = rcu_dereference(skb->dev->rx_handler);
4123         if (rx_handler) {
4124                 if (pt_prev) {
4125                         ret = deliver_skb(skb, pt_prev, orig_dev);
4126                         pt_prev = NULL;
4127                 }
4128                 switch (rx_handler(&skb)) {
4129                 case RX_HANDLER_CONSUMED:
4130                         ret = NET_RX_SUCCESS;
4131                         goto out;
4132                 case RX_HANDLER_ANOTHER:
4133                         goto another_round;
4134                 case RX_HANDLER_EXACT:
4135                         deliver_exact = true;
4136                 case RX_HANDLER_PASS:
4137                         break;
4138                 default:
4139                         BUG();
4140                 }
4141         }
4142
4143         if (unlikely(skb_vlan_tag_present(skb))) {
4144                 if (skb_vlan_tag_get_id(skb))
4145                         skb->pkt_type = PACKET_OTHERHOST;
4146                 /* Note: we might in the future use prio bits
4147                  * and set skb->priority like in vlan_do_receive()
4148                  * For the time being, just ignore Priority Code Point
4149                  */
4150                 skb->vlan_tci = 0;
4151         }
4152
4153         type = skb->protocol;
4154
4155         /* deliver only exact match when indicated */
4156         if (likely(!deliver_exact)) {
4157                 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4158                                        &ptype_base[ntohs(type) &
4159                                                    PTYPE_HASH_MASK]);
4160         }
4161
4162         deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4163                                &orig_dev->ptype_specific);
4164
4165         if (unlikely(skb->dev != orig_dev)) {
4166                 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4167                                        &skb->dev->ptype_specific);
4168         }
4169
4170         if (pt_prev) {
4171                 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
4172                         goto drop;
4173                 else
4174                         ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
4175         } else {
4176 drop:
4177                 if (!deliver_exact)
4178                         atomic_long_inc(&skb->dev->rx_dropped);
4179                 else
4180                         atomic_long_inc(&skb->dev->rx_nohandler);
4181                 kfree_skb(skb);
4182                 /* Jamal, now you will not able to escape explaining
4183                  * me how you were going to use this. :-)
4184                  */
4185                 ret = NET_RX_DROP;
4186         }
4187
4188 out:
4189         return ret;
4190 }
4191
4192 static int __netif_receive_skb(struct sk_buff *skb)
4193 {
4194         int ret;
4195
4196         if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
4197                 unsigned long pflags = current->flags;
4198
4199                 /*
4200                  * PFMEMALLOC skbs are special, they should
4201                  * - be delivered to SOCK_MEMALLOC sockets only
4202                  * - stay away from userspace
4203                  * - have bounded memory usage
4204                  *
4205                  * Use PF_MEMALLOC as this saves us from propagating the allocation
4206                  * context down to all allocation sites.
4207                  */
4208                 current->flags |= PF_MEMALLOC;
4209                 ret = __netif_receive_skb_core(skb, true);
4210                 tsk_restore_flags(current, pflags, PF_MEMALLOC);
4211         } else
4212                 ret = __netif_receive_skb_core(skb, false);
4213
4214         return ret;
4215 }
4216
4217 static int netif_receive_skb_internal(struct sk_buff *skb)
4218 {
4219         int ret;
4220
4221         net_timestamp_check(netdev_tstamp_prequeue, skb);
4222
4223         if (skb_defer_rx_timestamp(skb))
4224                 return NET_RX_SUCCESS;
4225
4226         rcu_read_lock();
4227
4228 #ifdef CONFIG_RPS
4229         if (static_key_false(&rps_needed)) {
4230                 struct rps_dev_flow voidflow, *rflow = &voidflow;
4231                 int cpu = get_rps_cpu(skb->dev, skb, &rflow);
4232
4233                 if (cpu >= 0) {
4234                         ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
4235                         rcu_read_unlock();
4236                         return ret;
4237                 }
4238         }
4239 #endif
4240         ret = __netif_receive_skb(skb);
4241         rcu_read_unlock();
4242         return ret;
4243 }
4244
4245 /**
4246  *      netif_receive_skb - process receive buffer from network
4247  *      @skb: buffer to process
4248  *
4249  *      netif_receive_skb() is the main receive data processing function.
4250  *      It always succeeds. The buffer may be dropped during processing
4251  *      for congestion control or by the protocol layers.
4252  *
4253  *      This function may only be called from softirq context and interrupts
4254  *      should be enabled.
4255  *
4256  *      Return values (usually ignored):
4257  *      NET_RX_SUCCESS: no congestion
4258  *      NET_RX_DROP: packet was dropped
4259  */
4260 int netif_receive_skb(struct sk_buff *skb)
4261 {
4262         trace_netif_receive_skb_entry(skb);
4263
4264         return netif_receive_skb_internal(skb);
4265 }
4266 EXPORT_SYMBOL(netif_receive_skb);
4267
4268 DEFINE_PER_CPU(struct work_struct, flush_works);
4269
4270 /* Network device is going away, flush any packets still pending */
4271 static void flush_backlog(struct work_struct *work)
4272 {
4273         struct sk_buff *skb, *tmp;
4274         struct softnet_data *sd;
4275
4276         local_bh_disable();
4277         sd = this_cpu_ptr(&softnet_data);
4278
4279         local_irq_disable();
4280         rps_lock(sd);
4281         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
4282                 if (skb->dev->reg_state == NETREG_UNREGISTERING) {
4283                         __skb_unlink(skb, &sd->input_pkt_queue);
4284                         kfree_skb(skb);
4285                         input_queue_head_incr(sd);
4286                 }
4287         }
4288         rps_unlock(sd);
4289         local_irq_enable();
4290
4291         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
4292                 if (skb->dev->reg_state == NETREG_UNREGISTERING) {
4293                         __skb_unlink(skb, &sd->process_queue);
4294                         kfree_skb(skb);
4295                         input_queue_head_incr(sd);
4296                 }
4297         }
4298         local_bh_enable();
4299 }
4300
4301 static void flush_all_backlogs(void)
4302 {
4303         unsigned int cpu;
4304
4305         get_online_cpus();
4306
4307         for_each_online_cpu(cpu)
4308                 queue_work_on(cpu, system_highpri_wq,
4309                               per_cpu_ptr(&flush_works, cpu));
4310
4311         for_each_online_cpu(cpu)
4312                 flush_work(per_cpu_ptr(&flush_works, cpu));
4313
4314         put_online_cpus();
4315 }
4316
4317 static int napi_gro_complete(struct sk_buff *skb)
4318 {
4319         struct packet_offload *ptype;
4320         __be16 type = skb->protocol;
4321         struct list_head *head = &offload_base;
4322         int err = -ENOENT;
4323
4324         BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
4325
4326         if (NAPI_GRO_CB(skb)->count == 1) {
4327                 skb_shinfo(skb)->gso_size = 0;
4328                 goto out;
4329         }
4330
4331         rcu_read_lock();
4332         list_for_each_entry_rcu(ptype, head, list) {
4333                 if (ptype->type != type || !ptype->callbacks.gro_complete)
4334                         continue;
4335
4336                 err = ptype->callbacks.gro_complete(skb, 0);
4337                 break;
4338         }
4339         rcu_read_unlock();
4340
4341         if (err) {
4342                 WARN_ON(&ptype->list == head);
4343                 kfree_skb(skb);
4344                 return NET_RX_SUCCESS;
4345         }
4346
4347 out:
4348         return netif_receive_skb_internal(skb);
4349 }
4350
4351 /* napi->gro_list contains packets ordered by age.
4352  * youngest packets at the head of it.
4353  * Complete skbs in reverse order to reduce latencies.
4354  */
4355 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
4356 {
4357         struct sk_buff *skb, *prev = NULL;
4358
4359         /* scan list and build reverse chain */
4360         for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
4361                 skb->prev = prev;
4362                 prev = skb;
4363         }
4364
4365         for (skb = prev; skb; skb = prev) {
4366                 skb->next = NULL;
4367
4368                 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
4369                         return;
4370
4371                 prev = skb->prev;
4372                 napi_gro_complete(skb);
4373                 napi->gro_count--;
4374         }
4375
4376         napi->gro_list = NULL;
4377 }
4378 EXPORT_SYMBOL(napi_gro_flush);
4379
4380 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
4381 {
4382         struct sk_buff *p;
4383         unsigned int maclen = skb->dev->hard_header_len;
4384         u32 hash = skb_get_hash_raw(skb);
4385
4386         for (p = napi->gro_list; p; p = p->next) {
4387                 unsigned long diffs;
4388
4389                 NAPI_GRO_CB(p)->flush = 0;
4390
4391                 if (hash != skb_get_hash_raw(p)) {
4392                         NAPI_GRO_CB(p)->same_flow = 0;
4393                         continue;
4394                 }
4395
4396                 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
4397                 diffs |= p->vlan_tci ^ skb->vlan_tci;
4398                 diffs |= skb_metadata_dst_cmp(p, skb);
4399                 if (maclen == ETH_HLEN)
4400                         diffs |= compare_ether_header(skb_mac_header(p),
4401                                                       skb_mac_header(skb));
4402                 else if (!diffs)
4403                         diffs = memcmp(skb_mac_header(p),
4404                                        skb_mac_header(skb),
4405                                        maclen);
4406                 NAPI_GRO_CB(p)->same_flow = !diffs;
4407         }
4408 }
4409
4410 static void skb_gro_reset_offset(struct sk_buff *skb)
4411 {
4412         const struct skb_shared_info *pinfo = skb_shinfo(skb);
4413         const skb_frag_t *frag0 = &pinfo->frags[0];
4414
4415         NAPI_GRO_CB(skb)->data_offset = 0;
4416         NAPI_GRO_CB(skb)->frag0 = NULL;
4417         NAPI_GRO_CB(skb)->frag0_len = 0;
4418
4419         if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
4420             pinfo->nr_frags &&
4421             !PageHighMem(skb_frag_page(frag0))) {
4422                 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
4423                 NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int,
4424                                                     skb_frag_size(frag0),
4425                                                     skb->end - skb->tail);
4426         }
4427 }
4428
4429 static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
4430 {
4431         struct skb_shared_info *pinfo = skb_shinfo(skb);
4432
4433         BUG_ON(skb->end - skb->tail < grow);
4434
4435         memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
4436
4437         skb->data_len -= grow;
4438         skb->tail += grow;
4439
4440         pinfo->frags[0].page_offset += grow;
4441         skb_frag_size_sub(&pinfo->frags[0], grow);
4442
4443         if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
4444                 skb_frag_unref(skb, 0);
4445                 memmove(pinfo->frags, pinfo->frags + 1,
4446                         --pinfo->nr_frags * sizeof(pinfo->frags[0]));
4447         }
4448 }
4449
4450 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4451 {
4452         struct sk_buff **pp = NULL;
4453         struct packet_offload *ptype;
4454         __be16 type = skb->protocol;
4455         struct list_head *head = &offload_base;
4456         int same_flow;
4457         enum gro_result ret;
4458         int grow;
4459
4460         if (!(skb->dev->features & NETIF_F_GRO))
4461                 goto normal;
4462
4463         if (skb->csum_bad)
4464                 goto normal;
4465
4466         gro_list_prepare(napi, skb);
4467
4468         rcu_read_lock();
4469         list_for_each_entry_rcu(ptype, head, list) {
4470                 if (ptype->type != type || !ptype->callbacks.gro_receive)
4471                         continue;
4472
4473                 skb_set_network_header(skb, skb_gro_offset(skb));
4474                 skb_reset_mac_len(skb);
4475                 NAPI_GRO_CB(skb)->same_flow = 0;
4476                 NAPI_GRO_CB(skb)->flush = skb_is_gso(skb) || skb_has_frag_list(skb);
4477                 NAPI_GRO_CB(skb)->free = 0;
4478                 NAPI_GRO_CB(skb)->encap_mark = 0;
4479                 NAPI_GRO_CB(skb)->recursion_counter = 0;
4480                 NAPI_GRO_CB(skb)->is_fou = 0;
4481                 NAPI_GRO_CB(skb)->is_atomic = 1;
4482                 NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
4483
4484                 /* Setup for GRO checksum validation */
4485                 switch (skb->ip_summed) {
4486                 case CHECKSUM_COMPLETE:
4487                         NAPI_GRO_CB(skb)->csum = skb->csum;
4488                         NAPI_GRO_CB(skb)->csum_valid = 1;
4489                         NAPI_GRO_CB(skb)->csum_cnt = 0;
4490                         break;
4491                 case CHECKSUM_UNNECESSARY:
4492                         NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
4493                         NAPI_GRO_CB(skb)->csum_valid = 0;
4494                         break;
4495                 default:
4496                         NAPI_GRO_CB(skb)->csum_cnt = 0;
4497                         NAPI_GRO_CB(skb)->csum_valid = 0;
4498                 }
4499
4500                 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
4501                 break;
4502         }
4503         rcu_read_unlock();
4504
4505         if (&ptype->list == head)
4506                 goto normal;
4507
4508         same_flow = NAPI_GRO_CB(skb)->same_flow;
4509         ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
4510
4511         if (pp) {
4512                 struct sk_buff *nskb = *pp;
4513
4514                 *pp = nskb->next;
4515                 nskb->next = NULL;
4516                 napi_gro_complete(nskb);
4517                 napi->gro_count--;
4518         }
4519
4520         if (same_flow)
4521                 goto ok;
4522
4523         if (NAPI_GRO_CB(skb)->flush)
4524                 goto normal;
4525
4526         if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
4527                 struct sk_buff *nskb = napi->gro_list;
4528
4529                 /* locate the end of the list to select the 'oldest' flow */
4530                 while (nskb->next) {
4531                         pp = &nskb->next;
4532                         nskb = *pp;
4533                 }
4534                 *pp = NULL;
4535                 nskb->next = NULL;
4536                 napi_gro_complete(nskb);
4537         } else {
4538                 napi->gro_count++;
4539         }
4540         NAPI_GRO_CB(skb)->count = 1;
4541         NAPI_GRO_CB(skb)->age = jiffies;
4542         NAPI_GRO_CB(skb)->last = skb;
4543         skb_shinfo(skb)->gso_size = skb_gro_len(skb);
4544         skb->next = napi->gro_list;
4545         napi->gro_list = skb;
4546         ret = GRO_HELD;
4547
4548 pull:
4549         grow = skb_gro_offset(skb) - skb_headlen(skb);
4550         if (grow > 0)
4551                 gro_pull_from_frag0(skb, grow);
4552 ok:
4553         return ret;
4554
4555 normal:
4556         ret = GRO_NORMAL;
4557         goto pull;
4558 }
4559
4560 struct packet_offload *gro_find_receive_by_type(__be16 type)
4561 {
4562         struct list_head *offload_head = &offload_base;
4563         struct packet_offload *ptype;
4564
4565         list_for_each_entry_rcu(ptype, offload_head, list) {
4566                 if (ptype->type != type || !ptype->callbacks.gro_receive)
4567                         continue;
4568                 return ptype;
4569         }
4570         return NULL;
4571 }
4572 EXPORT_SYMBOL(gro_find_receive_by_type);
4573
4574 struct packet_offload *gro_find_complete_by_type(__be16 type)
4575 {
4576         struct list_head *offload_head = &offload_base;
4577         struct packet_offload *ptype;
4578
4579         list_for_each_entry_rcu(ptype, offload_head, list) {
4580                 if (ptype->type != type || !ptype->callbacks.gro_complete)
4581                         continue;
4582                 return ptype;
4583         }
4584         return NULL;
4585 }
4586 EXPORT_SYMBOL(gro_find_complete_by_type);
4587
4588 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
4589 {
4590         switch (ret) {
4591         case GRO_NORMAL:
4592                 if (netif_receive_skb_internal(skb))
4593                         ret = GRO_DROP;
4594                 break;
4595
4596         case GRO_DROP:
4597                 kfree_skb(skb);
4598                 break;
4599
4600         case GRO_MERGED_FREE:
4601                 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) {
4602                         skb_dst_drop(skb);
4603                         secpath_reset(skb);
4604                         kmem_cache_free(skbuff_head_cache, skb);
4605                 } else {
4606                         __kfree_skb(skb);
4607                 }
4608                 break;
4609
4610         case GRO_HELD:
4611         case GRO_MERGED:
4612                 break;
4613         }
4614
4615         return ret;
4616 }
4617
4618 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4619 {
4620         skb_mark_napi_id(skb, napi);
4621         trace_napi_gro_receive_entry(skb);
4622
4623         skb_gro_reset_offset(skb);
4624
4625         return napi_skb_finish(dev_gro_receive(napi, skb), skb);
4626 }
4627 EXPORT_SYMBOL(napi_gro_receive);
4628
4629 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
4630 {
4631         if (unlikely(skb->pfmemalloc)) {
4632                 consume_skb(skb);
4633                 return;
4634         }
4635         __skb_pull(skb, skb_headlen(skb));
4636         /* restore the reserve we had after netdev_alloc_skb_ip_align() */
4637         skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
4638         skb->vlan_tci = 0;
4639         skb->dev = napi->dev;
4640         skb->skb_iif = 0;
4641         skb->encapsulation = 0;
4642         skb_shinfo(skb)->gso_type = 0;
4643         skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
4644         secpath_reset(skb);
4645
4646         napi->skb = skb;
4647 }
4648
4649 struct sk_buff *napi_get_frags(struct napi_struct *napi)
4650 {
4651         struct sk_buff *skb = napi->skb;
4652
4653         if (!skb) {
4654                 skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
4655                 if (skb) {
4656                         napi->skb = skb;
4657                         skb_mark_napi_id(skb, napi);
4658                 }
4659         }
4660         return skb;
4661 }
4662 EXPORT_SYMBOL(napi_get_frags);
4663
4664 static gro_result_t napi_frags_finish(struct napi_struct *napi,
4665                                       struct sk_buff *skb,
4666                                       gro_result_t ret)
4667 {
4668         switch (ret) {
4669         case GRO_NORMAL:
4670         case GRO_HELD:
4671                 __skb_push(skb, ETH_HLEN);
4672                 skb->protocol = eth_type_trans(skb, skb->dev);
4673                 if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
4674                         ret = GRO_DROP;
4675                 break;
4676
4677         case GRO_DROP:
4678         case GRO_MERGED_FREE:
4679                 napi_reuse_skb(napi, skb);
4680                 break;
4681
4682         case GRO_MERGED:
4683                 break;
4684         }
4685
4686         return ret;
4687 }
4688
4689 /* Upper GRO stack assumes network header starts at gro_offset=0
4690  * Drivers could call both napi_gro_frags() and napi_gro_receive()
4691  * We copy ethernet header into skb->data to have a common layout.
4692  */
4693 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
4694 {
4695         struct sk_buff *skb = napi->skb;
4696         const struct ethhdr *eth;
4697         unsigned int hlen = sizeof(*eth);
4698
4699         napi->skb = NULL;
4700
4701         skb_reset_mac_header(skb);
4702         skb_gro_reset_offset(skb);
4703
4704         eth = skb_gro_header_fast(skb, 0);
4705         if (unlikely(skb_gro_header_hard(skb, hlen))) {
4706                 eth = skb_gro_header_slow(skb, hlen, 0);
4707                 if (unlikely(!eth)) {
4708                         net_warn_ratelimited("%s: dropping impossible skb from %s\n",
4709                                              __func__, napi->dev->name);
4710                         napi_reuse_skb(napi, skb);
4711                         return NULL;
4712                 }
4713         } else {
4714                 gro_pull_from_frag0(skb, hlen);
4715                 NAPI_GRO_CB(skb)->frag0 += hlen;
4716                 NAPI_GRO_CB(skb)->frag0_len -= hlen;
4717         }
4718         __skb_pull(skb, hlen);
4719
4720         /*
4721          * This works because the only protocols we care about don't require
4722          * special handling.
4723          * We'll fix it up properly in napi_frags_finish()
4724          */
4725         skb->protocol = eth->h_proto;
4726
4727         return skb;
4728 }
4729
4730 gro_result_t napi_gro_frags(struct napi_struct *napi)
4731 {
4732         struct sk_buff *skb = napi_frags_skb(napi);
4733
4734         if (!skb)
4735                 return GRO_DROP;
4736
4737         trace_napi_gro_frags_entry(skb);
4738
4739         return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4740 }
4741 EXPORT_SYMBOL(napi_gro_frags);
4742
4743 /* Compute the checksum from gro_offset and return the folded value
4744  * after adding in any pseudo checksum.
4745  */
4746 __sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
4747 {
4748         __wsum wsum;
4749         __sum16 sum;
4750
4751         wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
4752
4753         /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
4754         sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
4755         if (likely(!sum)) {
4756                 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
4757                     !skb->csum_complete_sw)
4758                         netdev_rx_csum_fault(skb->dev);
4759         }
4760
4761         NAPI_GRO_CB(skb)->csum = wsum;
4762         NAPI_GRO_CB(skb)->csum_valid = 1;
4763
4764         return sum;
4765 }
4766 EXPORT_SYMBOL(__skb_gro_checksum_complete);
4767
4768 /*
4769  * net_rps_action_and_irq_enable sends any pending IPI's for rps.
4770  * Note: called with local irq disabled, but exits with local irq enabled.
4771  */
4772 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
4773 {
4774 #ifdef CONFIG_RPS
4775         struct softnet_data *remsd = sd->rps_ipi_list;
4776
4777         if (remsd) {
4778                 sd->rps_ipi_list = NULL;
4779
4780                 local_irq_enable();
4781
4782                 /* Send pending IPI's to kick RPS processing on remote cpus. */
4783                 while (remsd) {
4784                         struct softnet_data *next = remsd->rps_ipi_next;
4785
4786                         if (cpu_online(remsd->cpu))
4787                                 smp_call_function_single_async(remsd->cpu,
4788                                                            &remsd->csd);
4789                         remsd = next;
4790                 }
4791         } else
4792 #endif
4793                 local_irq_enable();
4794 }
4795
4796 static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
4797 {
4798 #ifdef CONFIG_RPS
4799         return sd->rps_ipi_list != NULL;
4800 #else
4801         return false;
4802 #endif
4803 }
4804
4805 static int process_backlog(struct napi_struct *napi, int quota)
4806 {
4807         struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4808         bool again = true;
4809         int work = 0;
4810
4811         /* Check if we have pending ipi, its better to send them now,
4812          * not waiting net_rx_action() end.
4813          */
4814         if (sd_has_rps_ipi_waiting(sd)) {
4815                 local_irq_disable();
4816                 net_rps_action_and_irq_enable(sd);
4817         }
4818
4819         napi->weight = dev_rx_weight;
4820         while (again) {
4821                 struct sk_buff *skb;
4822
4823                 while ((skb = __skb_dequeue(&sd->process_queue))) {
4824                         rcu_read_lock();
4825                         __netif_receive_skb(skb);
4826                         rcu_read_unlock();
4827                         input_queue_head_incr(sd);
4828                         if (++work >= quota)
4829                                 return work;
4830
4831                 }
4832
4833                 local_irq_disable();
4834                 rps_lock(sd);
4835                 if (skb_queue_empty(&sd->input_pkt_queue)) {
4836                         /*
4837                          * Inline a custom version of __napi_complete().
4838                          * only current cpu owns and manipulates this napi,
4839                          * and NAPI_STATE_SCHED is the only possible flag set
4840                          * on backlog.
4841                          * We can use a plain write instead of clear_bit(),
4842                          * and we dont need an smp_mb() memory barrier.
4843                          */
4844                         napi->state = 0;
4845                         again = false;
4846                 } else {
4847                         skb_queue_splice_tail_init(&sd->input_pkt_queue,
4848                                                    &sd->process_queue);
4849                 }
4850                 rps_unlock(sd);
4851                 local_irq_enable();
4852         }
4853
4854         return work;
4855 }
4856
4857 /**
4858  * __napi_schedule - schedule for receive
4859  * @n: entry to schedule
4860  *
4861  * The entry's receive function will be scheduled to run.
4862  * Consider using __napi_schedule_irqoff() if hard irqs are masked.
4863  */
4864 void __napi_schedule(struct napi_struct *n)
4865 {
4866         unsigned long flags;
4867
4868         local_irq_save(flags);
4869         ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4870         local_irq_restore(flags);
4871 }
4872 EXPORT_SYMBOL(__napi_schedule);
4873
4874 /**
4875  * __napi_schedule_irqoff - schedule for receive
4876  * @n: entry to schedule
4877  *
4878  * Variant of __napi_schedule() assuming hard irqs are masked
4879  */
4880 void __napi_schedule_irqoff(struct napi_struct *n)
4881 {
4882         ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4883 }
4884 EXPORT_SYMBOL(__napi_schedule_irqoff);
4885
4886 bool napi_complete_done(struct napi_struct *n, int work_done)
4887 {
4888         unsigned long flags;
4889
4890         /*
4891          * 1) Don't let napi dequeue from the cpu poll list
4892          *    just in case its running on a different cpu.
4893          * 2) If we are busy polling, do nothing here, we have
4894          *    the guarantee we will be called later.
4895          */
4896         if (unlikely(n->state & (NAPIF_STATE_NPSVC |
4897                                  NAPIF_STATE_IN_BUSY_POLL)))
4898                 return false;
4899
4900         if (n->gro_list) {
4901                 unsigned long timeout = 0;
4902
4903                 if (work_done)
4904                         timeout = n->dev->gro_flush_timeout;
4905
4906                 if (timeout)
4907                         hrtimer_start(&n->timer, ns_to_ktime(timeout),
4908                                       HRTIMER_MODE_REL_PINNED);
4909                 else
4910                         napi_gro_flush(n, false);
4911         }
4912         if (unlikely(!list_empty(&n->poll_list))) {
4913                 /* If n->poll_list is not empty, we need to mask irqs */
4914                 local_irq_save(flags);
4915                 list_del_init(&n->poll_list);
4916                 local_irq_restore(flags);
4917         }
4918         WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
4919         return true;
4920 }
4921 EXPORT_SYMBOL(napi_complete_done);
4922
4923 /* must be called under rcu_read_lock(), as we dont take a reference */
4924 static struct napi_struct *napi_by_id(unsigned int napi_id)
4925 {
4926         unsigned int hash = napi_id % HASH_SIZE(napi_hash);
4927         struct napi_struct *napi;
4928
4929         hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
4930                 if (napi->napi_id == napi_id)
4931                         return napi;
4932
4933         return NULL;
4934 }
4935
4936 #if defined(CONFIG_NET_RX_BUSY_POLL)
4937
4938 #define BUSY_POLL_BUDGET 8
4939
4940 static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
4941 {
4942         int rc;
4943
4944         clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);
4945
4946         local_bh_disable();
4947
4948         /* All we really want here is to re-enable device interrupts.
4949          * Ideally, a new ndo_busy_poll_stop() could avoid another round.
4950          */
4951         rc = napi->poll(napi, BUSY_POLL_BUDGET);
4952         netpoll_poll_unlock(have_poll_lock);
4953         if (rc == BUSY_POLL_BUDGET)
4954                 __napi_schedule(napi);
4955         local_bh_enable();
4956         if (local_softirq_pending())
4957                 do_softirq();
4958 }
4959
4960 bool sk_busy_loop(struct sock *sk, int nonblock)
4961 {
4962         unsigned long end_time = !nonblock ? sk_busy_loop_end_time(sk) : 0;
4963         int (*napi_poll)(struct napi_struct *napi, int budget);
4964         void *have_poll_lock = NULL;
4965         struct napi_struct *napi;
4966         int rc;
4967
4968 restart:
4969         rc = false;
4970         napi_poll = NULL;
4971
4972         rcu_read_lock();
4973
4974         napi = napi_by_id(sk->sk_napi_id);
4975         if (!napi)
4976                 goto out;
4977
4978         preempt_disable();
4979         for (;;) {
4980                 rc = 0;
4981                 local_bh_disable();
4982                 if (!napi_poll) {
4983                         unsigned long val = READ_ONCE(napi->state);
4984
4985                         /* If multiple threads are competing for this napi,
4986                          * we avoid dirtying napi->state as much as we can.
4987                          */
4988                         if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
4989                                    NAPIF_STATE_IN_BUSY_POLL))
4990                                 goto count;
4991                         if (cmpxchg(&napi->state, val,
4992                                     val | NAPIF_STATE_IN_BUSY_POLL |
4993                                           NAPIF_STATE_SCHED) != val)
4994                                 goto count;
4995                         have_poll_lock = netpoll_poll_lock(napi);
4996                         napi_poll = napi->poll;
4997                 }
4998                 rc = napi_poll(napi, BUSY_POLL_BUDGET);
4999                 trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
5000 count:
5001                 if (rc > 0)
5002                         __NET_ADD_STATS(sock_net(sk),
5003                                         LINUX_MIB_BUSYPOLLRXPACKETS, rc);
5004                 local_bh_enable();
5005
5006                 if (rc == LL_FLUSH_FAILED)
5007                         break; /* permanent failure */
5008
5009                 if (nonblock || !skb_queue_empty(&sk->sk_receive_queue) ||
5010                     busy_loop_timeout(end_time))
5011                         break;
5012
5013                 if (unlikely(need_resched())) {
5014                         if (napi_poll)
5015                                 busy_poll_stop(napi, have_poll_lock);
5016                         preempt_enable();
5017                         rcu_read_unlock();
5018                         cond_resched();
5019                         rc = !skb_queue_empty(&sk->sk_receive_queue);
5020                         if (rc || busy_loop_timeout(end_time))
5021                                 return rc;
5022                         goto restart;
5023                 }
5024                 cpu_relax();
5025         }
5026         if (napi_poll)
5027                 busy_poll_stop(napi, have_poll_lock);
5028         preempt_enable();
5029         rc = !skb_queue_empty(&sk->sk_receive_queue);
5030 out:
5031         rcu_read_unlock();
5032         return rc;
5033 }
5034 EXPORT_SYMBOL(sk_busy_loop);
5035
5036 #endif /* CONFIG_NET_RX_BUSY_POLL */
5037
5038 static void napi_hash_add(struct napi_struct *napi)
5039 {
5040         if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) ||
5041             test_and_set_bit(NAPI_STATE_HASHED, &napi->state))
5042                 return;
5043
5044         spin_lock(&napi_hash_lock);
5045
5046         /* 0..NR_CPUS+1 range is reserved for sender_cpu use */
5047         do {
5048                 if (unlikely(++napi_gen_id < NR_CPUS + 1))
5049                         napi_gen_id = NR_CPUS + 1;
5050         } while (napi_by_id(napi_gen_id));
5051         napi->napi_id = napi_gen_id;
5052
5053         hlist_add_head_rcu(&napi->napi_hash_node,
5054                            &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
5055
5056         spin_unlock(&napi_hash_lock);
5057 }
5058
5059 /* Warning : caller is responsible to make sure rcu grace period
5060  * is respected before freeing memory containing @napi
5061  */
5062 bool napi_hash_del(struct napi_struct *napi)
5063 {
5064         bool rcu_sync_needed = false;
5065
5066         spin_lock(&napi_hash_lock);
5067
5068         if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) {
5069                 rcu_sync_needed = true;
5070                 hlist_del_rcu(&napi->napi_hash_node);
5071         }
5072         spin_unlock(&napi_hash_lock);
5073         return rcu_sync_needed;
5074 }
5075 EXPORT_SYMBOL_GPL(napi_hash_del);
5076
5077 static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
5078 {
5079         struct napi_struct *napi;
5080
5081         napi = container_of(timer, struct napi_struct, timer);
5082         if (napi->gro_list)
5083                 napi_schedule(napi);
5084
5085         return HRTIMER_NORESTART;
5086 }
5087
5088 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
5089                     int (*poll)(struct napi_struct *, int), int weight)
5090 {
5091         INIT_LIST_HEAD(&napi->poll_list);
5092         hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
5093         napi->timer.function = napi_watchdog;
5094         napi->gro_count = 0;
5095         napi->gro_list = NULL;
5096         napi->skb = NULL;
5097         napi->poll = poll;
5098         if (weight > NAPI_POLL_WEIGHT)
5099                 pr_err_once("netif_napi_add() called with weight %d on device %s\n",
5100                             weight, dev->name);
5101         napi->weight = weight;
5102         list_add(&napi->dev_list, &dev->napi_list);
5103         napi->dev = dev;
5104 #ifdef CONFIG_NETPOLL
5105         napi->poll_owner = -1;
5106 #endif
5107         set_bit(NAPI_STATE_SCHED, &napi->state);
5108         napi_hash_add(napi);
5109 }
5110 EXPORT_SYMBOL(netif_napi_add);
5111
5112 void napi_disable(struct napi_struct *n)
5113 {
5114         might_sleep();
5115         set_bit(NAPI_STATE_DISABLE, &n->state);
5116
5117         while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
5118                 msleep(1);
5119         while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state))
5120                 msleep(1);
5121
5122         hrtimer_cancel(&n->timer);
5123
5124         clear_bit(NAPI_STATE_DISABLE, &n->state);
5125 }
5126 EXPORT_SYMBOL(napi_disable);
5127
5128 /* Must be called in process context */
5129 void netif_napi_del(struct napi_struct *napi)
5130 {
5131         might_sleep();
5132         if (napi_hash_del(napi))
5133                 synchronize_net();
5134         list_del_init(&napi->dev_list);
5135         napi_free_frags(napi);
5136
5137         kfree_skb_list(napi->gro_list);
5138         napi->gro_list = NULL;
5139         napi->gro_count = 0;
5140 }
5141 EXPORT_SYMBOL(netif_napi_del);
5142
5143 static int napi_poll(struct napi_struct *n, struct list_head *repoll)
5144 {
5145         void *have;
5146         int work, weight;
5147
5148         list_del_init(&n->poll_list);
5149
5150         have = netpoll_poll_lock(n);
5151
5152         weight = n->weight;
5153
5154         /* This NAPI_STATE_SCHED test is for avoiding a race
5155          * with netpoll's poll_napi().  Only the entity which
5156          * obtains the lock and sees NAPI_STATE_SCHED set will
5157          * actually make the ->poll() call.  Therefore we avoid
5158          * accidentally calling ->poll() when NAPI is not scheduled.
5159          */
5160         work = 0;
5161         if (test_bit(NAPI_STATE_SCHED, &n->state)) {
5162                 work = n->poll(n, weight);
5163                 trace_napi_poll(n, work, weight);
5164         }
5165
5166         WARN_ON_ONCE(work > weight);
5167
5168         if (likely(work < weight))
5169                 goto out_unlock;
5170
5171         /* Drivers must not modify the NAPI state if they
5172          * consume the entire weight.  In such cases this code
5173          * still "owns" the NAPI instance and therefore can
5174          * move the instance around on the list at-will.
5175          */
5176         if (unlikely(napi_disable_pending(n))) {
5177                 napi_complete(n);
5178                 goto out_unlock;
5179         }
5180
5181         if (n->gro_list) {
5182                 /* flush too old packets
5183                  * If HZ < 1000, flush all packets.
5184                  */
5185                 napi_gro_flush(n, HZ >= 1000);
5186         }
5187
5188         /* Some drivers may have called napi_schedule
5189          * prior to exhausting their budget.
5190          */
5191         if (unlikely(!list_empty(&n->poll_list))) {
5192                 pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
5193                              n->dev ? n->dev->name : "backlog");
5194                 goto out_unlock;
5195         }
5196
5197         list_add_tail(&n->poll_list, repoll);
5198
5199 out_unlock:
5200         netpoll_poll_unlock(have);
5201
5202         return work;
5203 }
5204
5205 static __latent_entropy void net_rx_action(struct softirq_action *h)
5206 {
5207         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
5208         unsigned long time_limit = jiffies + 2;
5209         int budget = netdev_budget;
5210         LIST_HEAD(list);
5211         LIST_HEAD(repoll);
5212
5213         local_irq_disable();
5214         list_splice_init(&sd->poll_list, &list);
5215         local_irq_enable();
5216
5217         for (;;) {
5218                 struct napi_struct *n;
5219
5220                 if (list_empty(&list)) {
5221                         if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
5222                                 goto out;
5223                         break;
5224                 }
5225
5226                 n = list_first_entry(&list, struct napi_struct, poll_list);
5227                 budget -= napi_poll(n, &repoll);
5228
5229                 /* If softirq window is exhausted then punt.
5230                  * Allow this to run for 2 jiffies since which will allow
5231                  * an average latency of 1.5/HZ.
5232                  */
5233                 if (unlikely(budget <= 0 ||
5234                              time_after_eq(jiffies, time_limit))) {
5235                         sd->time_squeeze++;
5236                         break;
5237                 }
5238         }
5239
5240         local_irq_disable();
5241
5242         list_splice_tail_init(&sd->poll_list, &list);
5243         list_splice_tail(&repoll, &list);
5244         list_splice(&list, &sd->poll_list);
5245         if (!list_empty(&sd->poll_list))
5246                 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
5247
5248         net_rps_action_and_irq_enable(sd);
5249 out:
5250         __kfree_skb_flush();
5251 }
5252
5253 struct netdev_adjacent {
5254         struct net_device *dev;
5255
5256         /* upper master flag, there can only be one master device per list */
5257         bool master;
5258
5259         /* counter for the number of times this device was added to us */
5260         u16 ref_nr;
5261
5262         /* private field for the users */
5263         void *private;
5264
5265         struct list_head list;
5266         struct rcu_head rcu;
5267 };
5268
5269 static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
5270                                                  struct list_head *adj_list)
5271 {
5272         struct netdev_adjacent *adj;
5273
5274         list_for_each_entry(adj, adj_list, list) {
5275                 if (adj->dev == adj_dev)
5276                         return adj;
5277         }
5278         return NULL;
5279 }
5280
5281 static int __netdev_has_upper_dev(struct net_device *upper_dev, void *data)
5282 {
5283         struct net_device *dev = data;
5284
5285         return upper_dev == dev;
5286 }
5287
5288 /**
5289  * netdev_has_upper_dev - Check if device is linked to an upper device
5290  * @dev: device
5291  * @upper_dev: upper device to check
5292  *
5293  * Find out if a device is linked to specified upper device and return true
5294  * in case it is. Note that this checks only immediate upper device,
5295  * not through a complete stack of devices. The caller must hold the RTNL lock.
5296  */
5297 bool netdev_has_upper_dev(struct net_device *dev,
5298                           struct net_device *upper_dev)
5299 {
5300         ASSERT_RTNL();
5301
5302         return netdev_walk_all_upper_dev_rcu(dev, __netdev_has_upper_dev,
5303                                              upper_dev);
5304 }
5305 EXPORT_SYMBOL(netdev_has_upper_dev);
5306
5307 /**
5308  * netdev_has_upper_dev_all - Check if device is linked to an upper device
5309  * @dev: device
5310  * @upper_dev: upper device to check
5311  *
5312  * Find out if a device is linked to specified upper device and return true
5313  * in case it is. Note that this checks the entire upper device chain.
5314  * The caller must hold rcu lock.
5315  */
5316
5317 bool netdev_has_upper_dev_all_rcu(struct net_device *dev,
5318                                   struct net_device *upper_dev)
5319 {
5320         return !!netdev_walk_all_upper_dev_rcu(dev, __netdev_has_upper_dev,
5321                                                upper_dev);
5322 }
5323 EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu);
5324
5325 /**
5326  * netdev_has_any_upper_dev - Check if device is linked to some device
5327  * @dev: device
5328  *
5329  * Find out if a device is linked to an upper device and return true in case
5330  * it is. The caller must hold the RTNL lock.
5331  */
5332 static bool netdev_has_any_upper_dev(struct net_device *dev)
5333 {
5334         ASSERT_RTNL();
5335
5336         return !list_empty(&dev->adj_list.upper);
5337 }
5338
5339 /**
5340  * netdev_master_upper_dev_get - Get master upper device
5341  * @dev: device
5342  *
5343  * Find a master upper device and return pointer to it or NULL in case
5344  * it's not there. The caller must hold the RTNL lock.
5345  */
5346 struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
5347 {
5348         struct netdev_adjacent *upper;
5349
5350         ASSERT_RTNL();
5351
5352         if (list_empty(&dev->adj_list.upper))
5353                 return NULL;
5354
5355         upper = list_first_entry(&dev->adj_list.upper,
5356                                  struct netdev_adjacent, list);
5357         if (likely(upper->master))
5358                 return upper->dev;
5359         return NULL;
5360 }
5361 EXPORT_SYMBOL(netdev_master_upper_dev_get);
5362
5363 /**
5364  * netdev_has_any_lower_dev - Check if device is linked to some device
5365  * @dev: device
5366  *
5367  * Find out if a device is linked to a lower device and return true in case
5368  * it is. The caller must hold the RTNL lock.
5369  */
5370 static bool netdev_has_any_lower_dev(struct net_device *dev)
5371 {
5372         ASSERT_RTNL();
5373
5374         return !list_empty(&dev->adj_list.lower);
5375 }
5376
5377 void *netdev_adjacent_get_private(struct list_head *adj_list)
5378 {
5379         struct netdev_adjacent *adj;
5380
5381         adj = list_entry(adj_list, struct netdev_adjacent, list);
5382
5383         return adj->private;
5384 }
5385 EXPORT_SYMBOL(netdev_adjacent_get_private);
5386
5387 /**
5388  * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
5389  * @dev: device
5390  * @iter: list_head ** of the current position
5391  *
5392  * Gets the next device from the dev's upper list, starting from iter
5393  * position. The caller must hold RCU read lock.
5394  */
5395 struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
5396                                                  struct list_head **iter)
5397 {
5398         struct netdev_adjacent *upper;
5399
5400         WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
5401
5402         upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5403
5404         if (&upper->list == &dev->adj_list.upper)
5405                 return NULL;
5406
5407         *iter = &upper->list;
5408
5409         return upper->dev;
5410 }
5411 EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
5412
5413 static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev,
5414                                                     struct list_head **iter)
5415 {
5416         struct netdev_adjacent *upper;
5417
5418         WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
5419
5420         upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5421
5422         if (&upper->list == &dev->adj_list.upper)
5423                 return NULL;
5424
5425         *iter = &upper->list;
5426
5427         return upper->dev;
5428 }
5429
5430 int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
5431                                   int (*fn)(struct net_device *dev,
5432                                             void *data),
5433                                   void *data)
5434 {
5435         struct net_device *udev;
5436         struct list_head *iter;
5437         int ret;
5438
5439         for (iter = &dev->adj_list.upper,
5440              udev = netdev_next_upper_dev_rcu(dev, &iter);
5441              udev;
5442              udev = netdev_next_upper_dev_rcu(dev, &iter)) {
5443                 /* first is the upper device itself */
5444                 ret = fn(udev, data);
5445                 if (ret)
5446                         return ret;
5447
5448                 /* then look at all of its upper devices */
5449                 ret = netdev_walk_all_upper_dev_rcu(udev, fn, data);
5450                 if (ret)
5451                         return ret;
5452         }
5453
5454         return 0;
5455 }
5456 EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu);
5457
5458 /**
5459  * netdev_lower_get_next_private - Get the next ->private from the
5460  *                                 lower neighbour list
5461  * @dev: device
5462  * @iter: list_head ** of the current position
5463  *
5464  * Gets the next netdev_adjacent->private from the dev's lower neighbour
5465  * list, starting from iter position. The caller must hold either hold the
5466  * RTNL lock or its own locking that guarantees that the neighbour lower
5467  * list will remain unchanged.
5468  */
5469 void *netdev_lower_get_next_private(struct net_device *dev,
5470                                     struct list_head **iter)
5471 {
5472         struct netdev_adjacent *lower;
5473
5474         lower = list_entry(*iter, struct netdev_adjacent, list);
5475
5476         if (&lower->list == &dev->adj_list.lower)
5477                 return NULL;
5478
5479         *iter = lower->list.next;
5480
5481         return lower->private;
5482 }
5483 EXPORT_SYMBOL(netdev_lower_get_next_private);
5484
5485 /**
5486  * netdev_lower_get_next_private_rcu - Get the next ->private from the
5487  *                                     lower neighbour list, RCU
5488  *                                     variant
5489  * @dev: device
5490  * @iter: list_head ** of the current position
5491  *
5492  * Gets the next netdev_adjacent->private from the dev's lower neighbour
5493  * list, starting from iter position. The caller must hold RCU read lock.
5494  */
5495 void *netdev_lower_get_next_private_rcu(struct net_device *dev,
5496                                         struct list_head **iter)
5497 {
5498         struct netdev_adjacent *lower;
5499
5500         WARN_ON_ONCE(!rcu_read_lock_held());
5501
5502         lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5503
5504         if (&lower->list == &dev->adj_list.lower)
5505                 return NULL;
5506
5507         *iter = &lower->list;
5508
5509         return lower->private;
5510 }
5511 EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
5512
5513 /**
5514  * netdev_lower_get_next - Get the next device from the lower neighbour
5515  *                         list
5516  * @dev: device
5517  * @iter: list_head ** of the current position
5518  *
5519  * Gets the next netdev_adjacent from the dev's lower neighbour
5520  * list, starting from iter position. The caller must hold RTNL lock or
5521  * its own locking that guarantees that the neighbour lower
5522  * list will remain unchanged.
5523  */
5524 void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
5525 {
5526         struct netdev_adjacent *lower;
5527
5528         lower = list_entry(*iter, struct netdev_adjacent, list);
5529
5530         if (&lower->list == &dev->adj_list.lower)
5531                 return NULL;
5532
5533         *iter = lower->list.next;
5534
5535         return lower->dev;
5536 }
5537 EXPORT_SYMBOL(netdev_lower_get_next);
5538
5539 static struct net_device *netdev_next_lower_dev(struct net_device *dev,
5540                                                 struct list_head **iter)
5541 {
5542         struct netdev_adjacent *lower;
5543
5544         lower = list_entry((*iter)->next, struct netdev_adjacent, list);
5545
5546         if (&lower->list == &dev->adj_list.lower)
5547                 return NULL;
5548
5549         *iter = &lower->list;
5550
5551         return lower->dev;
5552 }
5553
5554 int netdev_walk_all_lower_dev(struct net_device *dev,
5555                               int (*fn)(struct net_device *dev,
5556                                         void *data),
5557                               void *data)
5558 {
5559         struct net_device *ldev;
5560         struct list_head *iter;
5561         int ret;
5562
5563         for (iter = &dev->adj_list.lower,
5564              ldev = netdev_next_lower_dev(dev, &iter);
5565              ldev;
5566              ldev = netdev_next_lower_dev(dev, &iter)) {
5567                 /* first is the lower device itself */
5568                 ret = fn(ldev, data);
5569                 if (ret)
5570                         return ret;
5571
5572                 /* then look at all of its lower devices */
5573                 ret = netdev_walk_all_lower_dev(ldev, fn, data);
5574                 if (ret)
5575                         return ret;
5576         }
5577
5578         return 0;
5579 }
5580 EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev);
5581
5582 static struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
5583                                                     struct list_head **iter)
5584 {
5585         struct netdev_adjacent *lower;
5586
5587         lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5588         if (&lower->list == &dev->adj_list.lower)
5589                 return NULL;
5590
5591         *iter = &lower->list;
5592
5593         return lower->dev;
5594 }
5595
5596 int netdev_walk_all_lower_dev_rcu(struct net_device *dev,
5597                                   int (*fn)(struct net_device *dev,
5598                                             void *data),
5599                                   void *data)
5600 {
5601         struct net_device *ldev;
5602         struct list_head *iter;
5603         int ret;
5604
5605         for (iter = &dev->adj_list.lower,
5606              ldev = netdev_next_lower_dev_rcu(dev, &iter);
5607              ldev;
5608              ldev = netdev_next_lower_dev_rcu(dev, &iter)) {
5609                 /* first is the lower device itself */
5610                 ret = fn(ldev, data);
5611                 if (ret)
5612                         return ret;
5613
5614                 /* then look at all of its lower devices */
5615                 ret = netdev_walk_all_lower_dev_rcu(ldev, fn, data);
5616                 if (ret)
5617                         return ret;
5618         }
5619
5620         return 0;
5621 }
5622 EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev_rcu);
5623
5624 /**
5625  * netdev_lower_get_first_private_rcu - Get the first ->private from the
5626  *                                     lower neighbour list, RCU
5627  *                                     variant
5628  * @dev: device
5629  *
5630  * Gets the first netdev_adjacent->private from the dev's lower neighbour
5631  * list. The caller must hold RCU read lock.
5632  */
5633 void *netdev_lower_get_first_private_rcu(struct net_device *dev)
5634 {
5635         struct netdev_adjacent *lower;
5636
5637         lower = list_first_or_null_rcu(&dev->adj_list.lower,
5638                         struct netdev_adjacent, list);
5639         if (lower)
5640                 return lower->private;
5641         return NULL;
5642 }
5643 EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
5644
5645 /**
5646  * netdev_master_upper_dev_get_rcu - Get master upper device
5647  * @dev: device
5648  *
5649  * Find a master upper device and return pointer to it or NULL in case
5650  * it's not there. The caller must hold the RCU read lock.
5651  */
5652 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
5653 {
5654         struct netdev_adjacent *upper;
5655
5656         upper = list_first_or_null_rcu(&dev->adj_list.upper,
5657                                        struct netdev_adjacent, list);
5658         if (upper && likely(upper->master))
5659                 return upper->dev;
5660         return NULL;
5661 }
5662 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
5663
5664 static int netdev_adjacent_sysfs_add(struct net_device *dev,
5665                               struct net_device *adj_dev,
5666                               struct list_head *dev_list)
5667 {
5668         char linkname[IFNAMSIZ+7];
5669         sprintf(linkname, dev_list == &dev->adj_list.upper ?
5670                 "upper_%s" : "lower_%s", adj_dev->name);
5671         return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
5672                                  linkname);
5673 }
5674 static void netdev_adjacent_sysfs_del(struct net_device *dev,
5675                                char *name,
5676                                struct list_head *dev_list)
5677 {
5678         char linkname[IFNAMSIZ+7];
5679         sprintf(linkname, dev_list == &dev->adj_list.upper ?
5680                 "upper_%s" : "lower_%s", name);
5681         sysfs_remove_link(&(dev->dev.kobj), linkname);
5682 }
5683
5684 static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
5685                                                  struct net_device *adj_dev,
5686                                                  struct list_head *dev_list)
5687 {
5688         return (dev_list == &dev->adj_list.upper ||
5689                 dev_list == &dev->adj_list.lower) &&
5690                 net_eq(dev_net(dev), dev_net(adj_dev));
5691 }
5692
5693 static int __netdev_adjacent_dev_insert(struct net_device *dev,
5694                                         struct net_device *adj_dev,
5695                                         struct list_head *dev_list,
5696                                         void *private, bool master)
5697 {
5698         struct netdev_adjacent *adj;
5699         int ret;
5700
5701         adj = __netdev_find_adj(adj_dev, dev_list);
5702
5703         if (adj) {
5704                 adj->ref_nr += 1;
5705                 pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d\n",
5706                          dev->name, adj_dev->name, adj->ref_nr);
5707
5708                 return 0;
5709         }
5710
5711         adj = kmalloc(sizeof(*adj), GFP_KERNEL);
5712         if (!adj)
5713                 return -ENOMEM;
5714
5715         adj->dev = adj_dev;
5716         adj->master = master;
5717         adj->ref_nr = 1;
5718         adj->private = private;
5719         dev_hold(adj_dev);
5720
5721         pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n",
5722                  dev->name, adj_dev->name, adj->ref_nr, adj_dev->name);
5723
5724         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
5725                 ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
5726                 if (ret)
5727                         goto free_adj;
5728         }
5729
5730         /* Ensure that master link is always the first item in list. */
5731         if (master) {
5732                 ret = sysfs_create_link(&(dev->dev.kobj),
5733                                         &(adj_dev->dev.kobj), "master");
5734                 if (ret)
5735                         goto remove_symlinks;
5736
5737                 list_add_rcu(&adj->list, dev_list);
5738         } else {
5739                 list_add_tail_rcu(&adj->list, dev_list);
5740         }
5741
5742         return 0;
5743
5744 remove_symlinks:
5745         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5746                 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5747 free_adj:
5748         kfree(adj);
5749         dev_put(adj_dev);
5750
5751         return ret;
5752 }
5753
5754 static void __netdev_adjacent_dev_remove(struct net_device *dev,
5755                                          struct net_device *adj_dev,
5756                                          u16 ref_nr,
5757                                          struct list_head *dev_list)
5758 {
5759         struct netdev_adjacent *adj;
5760
5761         pr_debug("Remove adjacency: dev %s adj_dev %s ref_nr %d\n",
5762                  dev->name, adj_dev->name, ref_nr);
5763
5764         adj = __netdev_find_adj(adj_dev, dev_list);
5765
5766         if (!adj) {
5767                 pr_err("Adjacency does not exist for device %s from %s\n",
5768                        dev->name, adj_dev->name);
5769                 WARN_ON(1);
5770                 return;
5771         }
5772
5773         if (adj->ref_nr > ref_nr) {
5774                 pr_debug("adjacency: %s to %s ref_nr - %d = %d\n",
5775                          dev->name, adj_dev->name, ref_nr,
5776                          adj->ref_nr - ref_nr);
5777                 adj->ref_nr -= ref_nr;
5778                 return;
5779         }
5780
5781         if (adj->master)
5782                 sysfs_remove_link(&(dev->dev.kobj), "master");
5783
5784         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5785                 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5786
5787         list_del_rcu(&adj->list);
5788         pr_debug("adjacency: dev_put for %s, because link removed from %s to %s\n",
5789                  adj_dev->name, dev->name, adj_dev->name);
5790         dev_put(adj_dev);
5791         kfree_rcu(adj, rcu);
5792 }
5793
5794 static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
5795                                             struct net_device *upper_dev,
5796                                             struct list_head *up_list,
5797                                             struct list_head *down_list,
5798                                             void *private, bool master)
5799 {
5800         int ret;
5801
5802         ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list,
5803                                            private, master);
5804         if (ret)
5805                 return ret;
5806
5807         ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list,
5808                                            private, false);
5809         if (ret) {
5810                 __netdev_adjacent_dev_remove(dev, upper_dev, 1, up_list);
5811                 return ret;
5812         }
5813
5814         return 0;
5815 }
5816
5817 static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
5818                                                struct net_device *upper_dev,
5819                                                u16 ref_nr,
5820                                                struct list_head *up_list,
5821                                                struct list_head *down_list)
5822 {
5823         __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
5824         __netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list);
5825 }
5826
5827 static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
5828                                                 struct net_device *upper_dev,
5829                                                 void *private, bool master)
5830 {
5831         return __netdev_adjacent_dev_link_lists(dev, upper_dev,
5832                                                 &dev->adj_list.upper,
5833                                                 &upper_dev->adj_list.lower,
5834                                                 private, master);
5835 }
5836
5837 static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
5838                                                    struct net_device *upper_dev)
5839 {
5840         __netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1,
5841                                            &dev->adj_list.upper,
5842                                            &upper_dev->adj_list.lower);
5843 }
5844
5845 static int __netdev_upper_dev_link(struct net_device *dev,
5846                                    struct net_device *upper_dev, bool master,
5847                                    void *upper_priv, void *upper_info)
5848 {
5849         struct netdev_notifier_changeupper_info changeupper_info;
5850         int ret = 0;
5851
5852         ASSERT_RTNL();
5853
5854         if (dev == upper_dev)
5855                 return -EBUSY;
5856
5857         /* To prevent loops, check if dev is not upper device to upper_dev. */
5858         if (netdev_has_upper_dev(upper_dev, dev))
5859                 return -EBUSY;
5860
5861         if (netdev_has_upper_dev(dev, upper_dev))
5862                 return -EEXIST;
5863
5864         if (master && netdev_master_upper_dev_get(dev))
5865                 return -EBUSY;
5866
5867         changeupper_info.upper_dev = upper_dev;
5868         changeupper_info.master = master;
5869         changeupper_info.linking = true;
5870         changeupper_info.upper_info = upper_info;
5871
5872         ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
5873                                             &changeupper_info.info);
5874         ret = notifier_to_errno(ret);
5875         if (ret)
5876                 return ret;
5877
5878         ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv,
5879                                                    master);
5880         if (ret)
5881                 return ret;
5882
5883         ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
5884                                             &changeupper_info.info);
5885         ret = notifier_to_errno(ret);
5886         if (ret)
5887                 goto rollback;
5888
5889         return 0;
5890
5891 rollback:
5892         __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5893
5894         return ret;
5895 }
5896
5897 /**
5898  * netdev_upper_dev_link - Add a link to the upper device
5899  * @dev: device
5900  * @upper_dev: new upper device
5901  *
5902  * Adds a link to device which is upper to this one. The caller must hold
5903  * the RTNL lock. On a failure a negative errno code is returned.
5904  * On success the reference counts are adjusted and the function
5905  * returns zero.
5906  */
5907 int netdev_upper_dev_link(struct net_device *dev,
5908                           struct net_device *upper_dev)
5909 {
5910         return __netdev_upper_dev_link(dev, upper_dev, false, NULL, NULL);
5911 }
5912 EXPORT_SYMBOL(netdev_upper_dev_link);
5913
5914 /**
5915  * netdev_master_upper_dev_link - Add a master link to the upper device
5916  * @dev: device
5917  * @upper_dev: new upper device
5918  * @upper_priv: upper device private
5919  * @upper_info: upper info to be passed down via notifier
5920  *
5921  * Adds a link to device which is upper to this one. In this case, only
5922  * one master upper device can be linked, although other non-master devices
5923  * might be linked as well. The caller must hold the RTNL lock.
5924  * On a failure a negative errno code is returned. On success the reference
5925  * counts are adjusted and the function returns zero.
5926  */
5927 int netdev_master_upper_dev_link(struct net_device *dev,
5928                                  struct net_device *upper_dev,
5929                                  void *upper_priv, void *upper_info)
5930 {
5931         return __netdev_upper_dev_link(dev, upper_dev, true,
5932                                        upper_priv, upper_info);
5933 }
5934 EXPORT_SYMBOL(netdev_master_upper_dev_link);
5935
5936 /**
5937  * netdev_upper_dev_unlink - Removes a link to upper device
5938  * @dev: device
5939  * @upper_dev: new upper device
5940  *
5941  * Removes a link to device which is upper to this one. The caller must hold
5942  * the RTNL lock.
5943  */
5944 void netdev_upper_dev_unlink(struct net_device *dev,
5945                              struct net_device *upper_dev)
5946 {
5947         struct netdev_notifier_changeupper_info changeupper_info;
5948         ASSERT_RTNL();
5949
5950         changeupper_info.upper_dev = upper_dev;
5951         changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
5952         changeupper_info.linking = false;
5953
5954         call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
5955                                       &changeupper_info.info);
5956
5957         __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5958
5959         call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
5960                                       &changeupper_info.info);
5961 }
5962 EXPORT_SYMBOL(netdev_upper_dev_unlink);
5963
5964 /**
5965  * netdev_bonding_info_change - Dispatch event about slave change
5966  * @dev: device
5967  * @bonding_info: info to dispatch
5968  *
5969  * Send NETDEV_BONDING_INFO to netdev notifiers with info.
5970  * The caller must hold the RTNL lock.
5971  */
5972 void netdev_bonding_info_change(struct net_device *dev,
5973                                 struct netdev_bonding_info *bonding_info)
5974 {
5975         struct netdev_notifier_bonding_info     info;
5976
5977         memcpy(&info.bonding_info, bonding_info,
5978                sizeof(struct netdev_bonding_info));
5979         call_netdevice_notifiers_info(NETDEV_BONDING_INFO, dev,
5980                                       &info.info);
5981 }
5982 EXPORT_SYMBOL(netdev_bonding_info_change);
5983
5984 static void netdev_adjacent_add_links(struct net_device *dev)
5985 {
5986         struct netdev_adjacent *iter;
5987
5988         struct net *net = dev_net(dev);
5989
5990         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5991                 if (!net_eq(net, dev_net(iter->dev)))
5992                         continue;
5993                 netdev_adjacent_sysfs_add(iter->dev, dev,
5994                                           &iter->dev->adj_list.lower);
5995                 netdev_adjacent_sysfs_add(dev, iter->dev,
5996                                           &dev->adj_list.upper);
5997         }
5998
5999         list_for_each_entry(iter, &dev->adj_list.lower, list) {
6000                 if (!net_eq(net, dev_net(iter->dev)))
6001                         continue;
6002                 netdev_adjacent_sysfs_add(iter->dev, dev,
6003                                           &iter->dev->adj_list.upper);
6004                 netdev_adjacent_sysfs_add(dev, iter->dev,
6005                                           &dev->adj_list.lower);
6006         }
6007 }
6008
6009 static void netdev_adjacent_del_links(struct net_device *dev)
6010 {
6011         struct netdev_adjacent *iter;
6012
6013         struct net *net = dev_net(dev);
6014
6015         list_for_each_entry(iter, &dev->adj_list.upper, list) {
6016                 if (!net_eq(net, dev_net(iter->dev)))
6017                         continue;
6018                 netdev_adjacent_sysfs_del(iter->dev, dev->name,
6019                                           &iter->dev->adj_list.lower);
6020                 netdev_adjacent_sysfs_del(dev, iter->dev->name,
6021                                           &dev->adj_list.upper);
6022         }
6023
6024         list_for_each_entry(iter, &dev->adj_list.lower, list) {
6025                 if (!net_eq(net, dev_net(iter->dev)))
6026                         continue;
6027                 netdev_adjacent_sysfs_del(iter->dev, dev->name,
6028                                           &iter->dev->adj_list.upper);
6029                 netdev_adjacent_sysfs_del(dev, iter->dev->name,
6030                                           &dev->adj_list.lower);
6031         }
6032 }
6033
6034 void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
6035 {
6036         struct netdev_adjacent *iter;
6037
6038         struct net *net = dev_net(dev);
6039
6040         list_for_each_entry(iter, &dev->adj_list.upper, list) {
6041                 if (!net_eq(net, dev_net(iter->dev)))
6042                         continue;
6043                 netdev_adjacent_sysfs_del(iter->dev, oldname,
6044                                           &iter->dev->adj_list.lower);
6045                 netdev_adjacent_sysfs_add(iter->dev, dev,
6046                                           &iter->dev->adj_list.lower);
6047         }
6048
6049         list_for_each_entry(iter, &dev->adj_list.lower, list) {
6050                 if (!net_eq(net, dev_net(iter->dev)))
6051                         continue;
6052                 netdev_adjacent_sysfs_del(iter->dev, oldname,
6053                                           &iter->dev->adj_list.upper);
6054                 netdev_adjacent_sysfs_add(iter->dev, dev,
6055                                           &iter->dev->adj_list.upper);
6056         }
6057 }
6058
6059 void *netdev_lower_dev_get_private(struct net_device *dev,
6060                                    struct net_device *lower_dev)
6061 {
6062         struct netdev_adjacent *lower;
6063
6064         if (!lower_dev)
6065                 return NULL;
6066         lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower);
6067         if (!lower)
6068                 return NULL;
6069
6070         return lower->private;
6071 }
6072 EXPORT_SYMBOL(netdev_lower_dev_get_private);
6073
6074
6075 int dev_get_nest_level(struct net_device *dev)
6076 {
6077         struct net_device *lower = NULL;
6078         struct list_head *iter;
6079         int max_nest = -1;
6080         int nest;
6081
6082         ASSERT_RTNL();
6083
6084         netdev_for_each_lower_dev(dev, lower, iter) {
6085                 nest = dev_get_nest_level(lower);
6086                 if (max_nest < nest)
6087                         max_nest = nest;
6088         }
6089
6090         return max_nest + 1;
6091 }
6092 EXPORT_SYMBOL(dev_get_nest_level);
6093
6094 /**
6095  * netdev_lower_change - Dispatch event about lower device state change
6096  * @lower_dev: device
6097  * @lower_state_info: state to dispatch
6098  *
6099  * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info.
6100  * The caller must hold the RTNL lock.
6101  */
6102 void netdev_lower_state_changed(struct net_device *lower_dev,
6103                                 void *lower_state_info)
6104 {
6105         struct netdev_notifier_changelowerstate_info changelowerstate_info;
6106
6107         ASSERT_RTNL();
6108         changelowerstate_info.lower_state_info = lower_state_info;
6109         call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE, lower_dev,
6110                                       &changelowerstate_info.info);
6111 }
6112 EXPORT_SYMBOL(netdev_lower_state_changed);
6113
6114 static void dev_change_rx_flags(struct net_device *dev, int flags)
6115 {
6116         const struct net_device_ops *ops = dev->netdev_ops;
6117
6118         if (ops->ndo_change_rx_flags)
6119                 ops->ndo_change_rx_flags(dev, flags);
6120 }
6121
6122 static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
6123 {
6124         unsigned int old_flags = dev->flags;
6125         kuid_t uid;
6126         kgid_t gid;
6127
6128         ASSERT_RTNL();
6129
6130         dev->flags |= IFF_PROMISC;
6131         dev->promiscuity += inc;
6132         if (dev->promiscuity == 0) {
6133                 /*
6134                  * Avoid overflow.
6135                  * If inc causes overflow, untouch promisc and return error.
6136                  */
6137                 if (inc < 0)
6138                         dev->flags &= ~IFF_PROMISC;
6139                 else {
6140                         dev->promiscuity -= inc;
6141                         pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
6142                                 dev->name);
6143                         return -EOVERFLOW;
6144                 }
6145         }
6146         if (dev->flags != old_flags) {
6147                 pr_info("device %s %s promiscuous mode\n",
6148                         dev->name,
6149                         dev->flags & IFF_PROMISC ? "entered" : "left");
6150                 if (audit_enabled) {
6151                         current_uid_gid(&uid, &gid);
6152                         audit_log(current->audit_context, GFP_ATOMIC,
6153                                 AUDIT_ANOM_PROMISCUOUS,
6154                                 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
6155                                 dev->name, (dev->flags & IFF_PROMISC),
6156                                 (old_flags & IFF_PROMISC),
6157                                 from_kuid(&init_user_ns, audit_get_loginuid(current)),
6158                                 from_kuid(&init_user_ns, uid),
6159                                 from_kgid(&init_user_ns, gid),
6160                                 audit_get_sessionid(current));
6161                 }
6162
6163                 dev_change_rx_flags(dev, IFF_PROMISC);
6164         }
6165         if (notify)
6166                 __dev_notify_flags(dev, old_flags, IFF_PROMISC);
6167         return 0;
6168 }
6169
6170 /**
6171  *      dev_set_promiscuity     - update promiscuity count on a device
6172  *      @dev: device
6173  *      @inc: modifier
6174  *
6175  *      Add or remove promiscuity from a device. While the count in the device
6176  *      remains above zero the interface remains promiscuous. Once it hits zero
6177  *      the device reverts back to normal filtering operation. A negative inc
6178  *      value is used to drop promiscuity on the device.
6179  *      Return 0 if successful or a negative errno code on error.
6180  */
6181 int dev_set_promiscuity(struct net_device *dev, int inc)
6182 {
6183         unsigned int old_flags = dev->flags;
6184         int err;
6185
6186         err = __dev_set_promiscuity(dev, inc, true);
6187         if (err < 0)
6188                 return err;
6189         if (dev->flags != old_flags)
6190                 dev_set_rx_mode(dev);
6191         return err;
6192 }
6193 EXPORT_SYMBOL(dev_set_promiscuity);
6194
6195 static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
6196 {
6197         unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
6198
6199         ASSERT_RTNL();
6200
6201         dev->flags |= IFF_ALLMULTI;
6202         dev->allmulti += inc;
6203         if (dev->allmulti == 0) {
6204                 /*
6205                  * Avoid overflow.
6206                  * If inc causes overflow, untouch allmulti and return error.
6207                  */
6208                 if (inc < 0)
6209                         dev->flags &= ~IFF_ALLMULTI;
6210                 else {
6211                         dev->allmulti -= inc;
6212                         pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
6213                                 dev->name);
6214                         return -EOVERFLOW;
6215                 }
6216         }
6217         if (dev->flags ^ old_flags) {
6218                 dev_change_rx_flags(dev, IFF_ALLMULTI);
6219                 dev_set_rx_mode(dev);
6220                 if (notify)
6221                         __dev_notify_flags(dev, old_flags,
6222                                            dev->gflags ^ old_gflags);
6223         }
6224         return 0;
6225 }
6226
6227 /**
6228  *      dev_set_allmulti        - update allmulti count on a device
6229  *      @dev: device
6230  *      @inc: modifier
6231  *
6232  *      Add or remove reception of all multicast frames to a device. While the
6233  *      count in the device remains above zero the interface remains listening
6234  *      to all interfaces. Once it hits zero the device reverts back to normal
6235  *      filtering operation. A negative @inc value is used to drop the counter
6236  *      when releasing a resource needing all multicasts.
6237  *      Return 0 if successful or a negative errno code on error.
6238  */
6239
6240 int dev_set_allmulti(struct net_device *dev, int inc)
6241 {
6242         return __dev_set_allmulti(dev, inc, true);
6243 }
6244 EXPORT_SYMBOL(dev_set_allmulti);
6245
6246 /*
6247  *      Upload unicast and multicast address lists to device and
6248  *      configure RX filtering. When the device doesn't support unicast
6249  *      filtering it is put in promiscuous mode while unicast addresses
6250  *      are present.
6251  */
6252 void __dev_set_rx_mode(struct net_device *dev)
6253 {
6254         const struct net_device_ops *ops = dev->netdev_ops;
6255
6256         /* dev_open will call this function so the list will stay sane. */
6257         if (!(dev->flags&IFF_UP))
6258                 return;
6259
6260         if (!netif_device_present(dev))
6261                 return;
6262
6263         if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
6264                 /* Unicast addresses changes may only happen under the rtnl,
6265                  * therefore calling __dev_set_promiscuity here is safe.
6266                  */
6267                 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
6268                         __dev_set_promiscuity(dev, 1, false);
6269                         dev->uc_promisc = true;
6270                 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
6271                         __dev_set_promiscuity(dev, -1, false);
6272                         dev->uc_promisc = false;
6273                 }
6274         }
6275
6276         if (ops->ndo_set_rx_mode)
6277                 ops->ndo_set_rx_mode(dev);
6278 }
6279
6280 void dev_set_rx_mode(struct net_device *dev)
6281 {
6282         netif_addr_lock_bh(dev);
6283         __dev_set_rx_mode(dev);
6284         netif_addr_unlock_bh(dev);
6285 }
6286
6287 /**
6288  *      dev_get_flags - get flags reported to userspace
6289  *      @dev: device
6290  *
6291  *      Get the combination of flag bits exported through APIs to userspace.
6292  */
6293 unsigned int dev_get_flags(const struct net_device *dev)
6294 {
6295         unsigned int flags;
6296
6297         flags = (dev->flags & ~(IFF_PROMISC |
6298                                 IFF_ALLMULTI |
6299                                 IFF_RUNNING |
6300                                 IFF_LOWER_UP |
6301                                 IFF_DORMANT)) |
6302                 (dev->gflags & (IFF_PROMISC |
6303                                 IFF_ALLMULTI));
6304
6305         if (netif_running(dev)) {
6306                 if (netif_oper_up(dev))
6307                         flags |= IFF_RUNNING;
6308                 if (netif_carrier_ok(dev))
6309                         flags |= IFF_LOWER_UP;
6310                 if (netif_dormant(dev))
6311                         flags |= IFF_DORMANT;
6312         }
6313
6314         return flags;
6315 }
6316 EXPORT_SYMBOL(dev_get_flags);
6317
6318 int __dev_change_flags(struct net_device *dev, unsigned int flags)
6319 {
6320         unsigned int old_flags = dev->flags;
6321         int ret;
6322
6323         ASSERT_RTNL();
6324
6325         /*
6326          *      Set the flags on our device.
6327          */
6328
6329         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
6330                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
6331                                IFF_AUTOMEDIA)) |
6332                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
6333                                     IFF_ALLMULTI));
6334
6335         /*
6336          *      Load in the correct multicast list now the flags have changed.
6337          */
6338
6339         if ((old_flags ^ flags) & IFF_MULTICAST)
6340                 dev_change_rx_flags(dev, IFF_MULTICAST);
6341
6342         dev_set_rx_mode(dev);
6343
6344         /*
6345          *      Have we downed the interface. We handle IFF_UP ourselves
6346          *      according to user attempts to set it, rather than blindly
6347          *      setting it.
6348          */
6349
6350         ret = 0;
6351         if ((old_flags ^ flags) & IFF_UP)
6352                 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
6353
6354         if ((flags ^ dev->gflags) & IFF_PROMISC) {
6355                 int inc = (flags & IFF_PROMISC) ? 1 : -1;
6356                 unsigned int old_flags = dev->flags;
6357
6358                 dev->gflags ^= IFF_PROMISC;
6359
6360                 if (__dev_set_promiscuity(dev, inc, false) >= 0)
6361                         if (dev->flags != old_flags)
6362                                 dev_set_rx_mode(dev);
6363         }
6364
6365         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
6366            is important. Some (broken) drivers set IFF_PROMISC, when
6367            IFF_ALLMULTI is requested not asking us and not reporting.
6368          */
6369         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
6370                 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
6371
6372                 dev->gflags ^= IFF_ALLMULTI;
6373                 __dev_set_allmulti(dev, inc, false);
6374         }
6375
6376         return ret;
6377 }
6378
6379 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
6380                         unsigned int gchanges)
6381 {
6382         unsigned int changes = dev->flags ^ old_flags;
6383
6384         if (gchanges)
6385                 rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
6386
6387         if (changes & IFF_UP) {
6388                 if (dev->flags & IFF_UP)
6389                         call_netdevice_notifiers(NETDEV_UP, dev);
6390                 else
6391                         call_netdevice_notifiers(NETDEV_DOWN, dev);
6392         }
6393
6394         if (dev->flags & IFF_UP &&
6395             (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
6396                 struct netdev_notifier_change_info change_info;
6397
6398                 change_info.flags_changed = changes;
6399                 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
6400                                               &change_info.info);
6401         }
6402 }
6403
6404 /**
6405  *      dev_change_flags - change device settings
6406  *      @dev: device
6407  *      @flags: device state flags
6408  *
6409  *      Change settings on device based state flags. The flags are
6410  *      in the userspace exported format.
6411  */
6412 int dev_change_flags(struct net_device *dev, unsigned int flags)
6413 {
6414         int ret;
6415         unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
6416
6417         ret = __dev_change_flags(dev, flags);
6418         if (ret < 0)
6419                 return ret;
6420
6421         changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
6422         __dev_notify_flags(dev, old_flags, changes);
6423         return ret;
6424 }
6425 EXPORT_SYMBOL(dev_change_flags);
6426
6427 static int __dev_set_mtu(struct net_device *dev, int new_mtu)
6428 {
6429         const struct net_device_ops *ops = dev->netdev_ops;
6430
6431         if (ops->ndo_change_mtu)
6432                 return ops->ndo_change_mtu(dev, new_mtu);
6433
6434         dev->mtu = new_mtu;
6435         return 0;
6436 }
6437
6438 /**
6439  *      dev_set_mtu - Change maximum transfer unit
6440  *      @dev: device
6441  *      @new_mtu: new transfer unit
6442  *
6443  *      Change the maximum transfer size of the network device.
6444  */
6445 int dev_set_mtu(struct net_device *dev, int new_mtu)
6446 {
6447         int err, orig_mtu;
6448
6449         if (new_mtu == dev->mtu)
6450                 return 0;
6451
6452         /* MTU must be positive, and in range */
6453         if (new_mtu < 0 || new_mtu < dev->min_mtu) {
6454                 net_err_ratelimited("%s: Invalid MTU %d requested, hw min %d\n",
6455                                     dev->name, new_mtu, dev->min_mtu);
6456                 return -EINVAL;
6457         }
6458
6459         if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) {
6460                 net_err_ratelimited("%s: Invalid MTU %d requested, hw max %d\n",
6461                                     dev->name, new_mtu, dev->max_mtu);
6462                 return -EINVAL;
6463         }
6464
6465         if (!netif_device_present(dev))
6466                 return -ENODEV;
6467
6468         err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
6469         err = notifier_to_errno(err);
6470         if (err)
6471                 return err;
6472
6473         orig_mtu = dev->mtu;
6474         err = __dev_set_mtu(dev, new_mtu);
6475
6476         if (!err) {
6477                 err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
6478                 err = notifier_to_errno(err);
6479                 if (err) {
6480                         /* setting mtu back and notifying everyone again,
6481                          * so that they have a chance to revert changes.
6482                          */
6483                         __dev_set_mtu(dev, orig_mtu);
6484                         call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
6485                 }
6486         }
6487         return err;
6488 }
6489 EXPORT_SYMBOL(dev_set_mtu);
6490
6491 /**
6492  *      dev_set_group - Change group this device belongs to
6493  *      @dev: device
6494  *      @new_group: group this device should belong to
6495  */
6496 void dev_set_group(struct net_device *dev, int new_group)
6497 {
6498         dev->group = new_group;
6499 }
6500 EXPORT_SYMBOL(dev_set_group);
6501
6502 /**
6503  *      dev_set_mac_address - Change Media Access Control Address
6504  *      @dev: device
6505  *      @sa: new address
6506  *
6507  *      Change the hardware (MAC) address of the device
6508  */
6509 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
6510 {
6511         const struct net_device_ops *ops = dev->netdev_ops;
6512         int err;
6513
6514         if (!ops->ndo_set_mac_address)
6515                 return -EOPNOTSUPP;
6516         if (sa->sa_family != dev->type)
6517                 return -EINVAL;
6518         if (!netif_device_present(dev))
6519                 return -ENODEV;
6520         err = ops->ndo_set_mac_address(dev, sa);
6521         if (err)
6522                 return err;
6523         dev->addr_assign_type = NET_ADDR_SET;
6524         call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
6525         add_device_randomness(dev->dev_addr, dev->addr_len);
6526         return 0;
6527 }
6528 EXPORT_SYMBOL(dev_set_mac_address);
6529
6530 /**
6531  *      dev_change_carrier - Change device carrier
6532  *      @dev: device
6533  *      @new_carrier: new value
6534  *
6535  *      Change device carrier
6536  */
6537 int dev_change_carrier(struct net_device *dev, bool new_carrier)
6538 {
6539         const struct net_device_ops *ops = dev->netdev_ops;
6540
6541         if (!ops->ndo_change_carrier)
6542                 return -EOPNOTSUPP;
6543         if (!netif_device_present(dev))
6544                 return -ENODEV;
6545         return ops->ndo_change_carrier(dev, new_carrier);
6546 }
6547 EXPORT_SYMBOL(dev_change_carrier);
6548
6549 /**
6550  *      dev_get_phys_port_id - Get device physical port ID
6551  *      @dev: device
6552  *      @ppid: port ID
6553  *
6554  *      Get device physical port ID
6555  */
6556 int dev_get_phys_port_id(struct net_device *dev,
6557                          struct netdev_phys_item_id *ppid)
6558 {
6559         const struct net_device_ops *ops = dev->netdev_ops;
6560
6561         if (!ops->ndo_get_phys_port_id)
6562                 return -EOPNOTSUPP;
6563         return ops->ndo_get_phys_port_id(dev, ppid);
6564 }
6565 EXPORT_SYMBOL(dev_get_phys_port_id);
6566
6567 /**
6568  *      dev_get_phys_port_name - Get device physical port name
6569  *      @dev: device
6570  *      @name: port name
6571  *      @len: limit of bytes to copy to name
6572  *
6573  *      Get device physical port name
6574  */
6575 int dev_get_phys_port_name(struct net_device *dev,
6576                            char *name, size_t len)
6577 {
6578         const struct net_device_ops *ops = dev->netdev_ops;
6579
6580         if (!ops->ndo_get_phys_port_name)
6581                 return -EOPNOTSUPP;
6582         return ops->ndo_get_phys_port_name(dev, name, len);
6583 }
6584 EXPORT_SYMBOL(dev_get_phys_port_name);
6585
6586 /**
6587  *      dev_change_proto_down - update protocol port state information
6588  *      @dev: device
6589  *      @proto_down: new value
6590  *
6591  *      This info can be used by switch drivers to set the phys state of the
6592  *      port.
6593  */
6594 int dev_change_proto_down(struct net_device *dev, bool proto_down)
6595 {
6596         const struct net_device_ops *ops = dev->netdev_ops;
6597
6598         if (!ops->ndo_change_proto_down)
6599                 return -EOPNOTSUPP;
6600         if (!netif_device_present(dev))
6601                 return -ENODEV;
6602         return ops->ndo_change_proto_down(dev, proto_down);
6603 }
6604 EXPORT_SYMBOL(dev_change_proto_down);
6605
6606 /**
6607  *      dev_change_xdp_fd - set or clear a bpf program for a device rx path
6608  *      @dev: device
6609  *      @fd: new program fd or negative value to clear
6610  *      @flags: xdp-related flags
6611  *
6612  *      Set or clear a bpf program for a device
6613  */
6614 int dev_change_xdp_fd(struct net_device *dev, int fd, u32 flags)
6615 {
6616         const struct net_device_ops *ops = dev->netdev_ops;
6617         struct bpf_prog *prog = NULL;
6618         struct netdev_xdp xdp;
6619         int err;
6620
6621         ASSERT_RTNL();
6622
6623         if (!ops->ndo_xdp)
6624                 return -EOPNOTSUPP;
6625         if (fd >= 0) {
6626                 if (flags & XDP_FLAGS_UPDATE_IF_NOEXIST) {
6627                         memset(&xdp, 0, sizeof(xdp));
6628                         xdp.command = XDP_QUERY_PROG;
6629
6630                         err = ops->ndo_xdp(dev, &xdp);
6631                         if (err < 0)
6632                                 return err;
6633                         if (xdp.prog_attached)
6634                                 return -EBUSY;
6635                 }
6636
6637                 prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP);
6638                 if (IS_ERR(prog))
6639                         return PTR_ERR(prog);
6640         }
6641
6642         memset(&xdp, 0, sizeof(xdp));
6643         xdp.command = XDP_SETUP_PROG;
6644         xdp.prog = prog;
6645
6646         err = ops->ndo_xdp(dev, &xdp);
6647         if (err < 0 && prog)
6648                 bpf_prog_put(prog);
6649
6650         return err;
6651 }
6652 EXPORT_SYMBOL(dev_change_xdp_fd);
6653
6654 /**
6655  *      dev_new_index   -       allocate an ifindex
6656  *      @net: the applicable net namespace
6657  *
6658  *      Returns a suitable unique value for a new device interface
6659  *      number.  The caller must hold the rtnl semaphore or the
6660  *      dev_base_lock to be sure it remains unique.
6661  */
6662 static int dev_new_index(struct net *net)
6663 {
6664         int ifindex = net->ifindex;
6665         for (;;) {
6666                 if (++ifindex <= 0)
6667                         ifindex = 1;
6668                 if (!__dev_get_by_index(net, ifindex))
6669                         return net->ifindex = ifindex;
6670         }
6671 }
6672
6673 /* Delayed registration/unregisteration */
6674 static LIST_HEAD(net_todo_list);
6675 DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
6676
6677 static void net_set_todo(struct net_device *dev)
6678 {
6679         list_add_tail(&dev->todo_list, &net_todo_list);
6680         dev_net(dev)->dev_unreg_count++;
6681 }
6682
6683 static void rollback_registered_many(struct list_head *head)
6684 {
6685         struct net_device *dev, *tmp;
6686         LIST_HEAD(close_head);
6687
6688         BUG_ON(dev_boot_phase);
6689         ASSERT_RTNL();
6690
6691         list_for_each_entry_safe(dev, tmp, head, unreg_list) {
6692                 /* Some devices call without registering
6693                  * for initialization unwind. Remove those
6694                  * devices and proceed with the remaining.
6695                  */
6696                 if (dev->reg_state == NETREG_UNINITIALIZED) {
6697                         pr_debug("unregister_netdevice: device %s/%p never was registered\n",
6698                                  dev->name, dev);
6699
6700                         WARN_ON(1);
6701                         list_del(&dev->unreg_list);
6702                         continue;
6703                 }
6704                 dev->dismantle = true;
6705                 BUG_ON(dev->reg_state != NETREG_REGISTERED);
6706         }
6707
6708         /* If device is running, close it first. */
6709         list_for_each_entry(dev, head, unreg_list)
6710                 list_add_tail(&dev->close_list, &close_head);
6711         dev_close_many(&close_head, true);
6712
6713         list_for_each_entry(dev, head, unreg_list) {
6714                 /* And unlink it from device chain. */
6715                 unlist_netdevice(dev);
6716
6717                 dev->reg_state = NETREG_UNREGISTERING;
6718         }
6719         flush_all_backlogs();
6720
6721         synchronize_net();
6722
6723         list_for_each_entry(dev, head, unreg_list) {
6724                 struct sk_buff *skb = NULL;
6725
6726                 /* Shutdown queueing discipline. */
6727                 dev_shutdown(dev);
6728
6729
6730                 /* Notify protocols, that we are about to destroy
6731                    this device. They should clean all the things.
6732                 */
6733                 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6734
6735                 if (!dev->rtnl_link_ops ||
6736                     dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6737                         skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U,
6738                                                      GFP_KERNEL);
6739
6740                 /*
6741                  *      Flush the unicast and multicast chains
6742                  */
6743                 dev_uc_flush(dev);
6744                 dev_mc_flush(dev);
6745
6746                 if (dev->netdev_ops->ndo_uninit)
6747                         dev->netdev_ops->ndo_uninit(dev);
6748
6749                 if (skb)
6750                         rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
6751
6752                 /* Notifier chain MUST detach us all upper devices. */
6753                 WARN_ON(netdev_has_any_upper_dev(dev));
6754                 WARN_ON(netdev_has_any_lower_dev(dev));
6755
6756                 /* Remove entries from kobject tree */
6757                 netdev_unregister_kobject(dev);
6758 #ifdef CONFIG_XPS
6759                 /* Remove XPS queueing entries */
6760                 netif_reset_xps_queues_gt(dev, 0);
6761 #endif
6762         }
6763
6764         synchronize_net();
6765
6766         list_for_each_entry(dev, head, unreg_list)
6767                 dev_put(dev);
6768 }
6769
6770 static void rollback_registered(struct net_device *dev)
6771 {
6772         LIST_HEAD(single);
6773
6774         list_add(&dev->unreg_list, &single);
6775         rollback_registered_many(&single);
6776         list_del(&single);
6777 }
6778
6779 static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
6780         struct net_device *upper, netdev_features_t features)
6781 {
6782         netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
6783         netdev_features_t feature;
6784         int feature_bit;
6785
6786         for_each_netdev_feature(&upper_disables, feature_bit) {
6787                 feature = __NETIF_F_BIT(feature_bit);
6788                 if (!(upper->wanted_features & feature)
6789                     && (features & feature)) {
6790                         netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
6791                                    &feature, upper->name);
6792                         features &= ~feature;
6793                 }
6794         }
6795
6796         return features;
6797 }
6798
6799 static void netdev_sync_lower_features(struct net_device *upper,
6800         struct net_device *lower, netdev_features_t features)
6801 {
6802         netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
6803         netdev_features_t feature;
6804         int feature_bit;
6805
6806         for_each_netdev_feature(&upper_disables, feature_bit) {
6807                 feature = __NETIF_F_BIT(feature_bit);
6808                 if (!(features & feature) && (lower->features & feature)) {
6809                         netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
6810                                    &feature, lower->name);
6811                         lower->wanted_features &= ~feature;
6812                         netdev_update_features(lower);
6813
6814                         if (unlikely(lower->features & feature))
6815                                 netdev_WARN(upper, "failed to disable %pNF on %s!\n",
6816                                             &feature, lower->name);
6817                 }
6818         }
6819 }
6820
6821 static netdev_features_t netdev_fix_features(struct net_device *dev,
6822         netdev_features_t features)
6823 {
6824         /* Fix illegal checksum combinations */
6825         if ((features & NETIF_F_HW_CSUM) &&
6826             (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6827                 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
6828                 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
6829         }
6830
6831         /* TSO requires that SG is present as well. */
6832         if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
6833                 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
6834                 features &= ~NETIF_F_ALL_TSO;
6835         }
6836
6837         if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
6838                                         !(features & NETIF_F_IP_CSUM)) {
6839                 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
6840                 features &= ~NETIF_F_TSO;
6841                 features &= ~NETIF_F_TSO_ECN;
6842         }
6843
6844         if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
6845                                          !(features & NETIF_F_IPV6_CSUM)) {
6846                 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
6847                 features &= ~NETIF_F_TSO6;
6848         }
6849
6850         /* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */
6851         if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO))
6852                 features &= ~NETIF_F_TSO_MANGLEID;
6853
6854         /* TSO ECN requires that TSO is present as well. */
6855         if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
6856                 features &= ~NETIF_F_TSO_ECN;
6857
6858         /* Software GSO depends on SG. */
6859         if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
6860                 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
6861                 features &= ~NETIF_F_GSO;
6862         }
6863
6864         /* UFO needs SG and checksumming */
6865         if (features & NETIF_F_UFO) {
6866                 /* maybe split UFO into V4 and V6? */
6867                 if (!(features & NETIF_F_HW_CSUM) &&
6868                     ((features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) !=
6869                      (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM))) {
6870                         netdev_dbg(dev,
6871                                 "Dropping NETIF_F_UFO since no checksum offload features.\n");
6872                         features &= ~NETIF_F_UFO;
6873                 }
6874
6875                 if (!(features & NETIF_F_SG)) {
6876                         netdev_dbg(dev,
6877                                 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
6878                         features &= ~NETIF_F_UFO;
6879                 }
6880         }
6881
6882         /* GSO partial features require GSO partial be set */
6883         if ((features & dev->gso_partial_features) &&
6884             !(features & NETIF_F_GSO_PARTIAL)) {
6885                 netdev_dbg(dev,
6886                            "Dropping partially supported GSO features since no GSO partial.\n");
6887                 features &= ~dev->gso_partial_features;
6888         }
6889
6890         return features;
6891 }
6892
6893 int __netdev_update_features(struct net_device *dev)
6894 {
6895         struct net_device *upper, *lower;
6896         netdev_features_t features;
6897         struct list_head *iter;
6898         int err = -1;
6899
6900         ASSERT_RTNL();
6901
6902         features = netdev_get_wanted_features(dev);
6903
6904         if (dev->netdev_ops->ndo_fix_features)
6905                 features = dev->netdev_ops->ndo_fix_features(dev, features);
6906
6907         /* driver might be less strict about feature dependencies */
6908         features = netdev_fix_features(dev, features);
6909
6910         /* some features can't be enabled if they're off an an upper device */
6911         netdev_for_each_upper_dev_rcu(dev, upper, iter)
6912                 features = netdev_sync_upper_features(dev, upper, features);
6913
6914         if (dev->features == features)
6915                 goto sync_lower;
6916
6917         netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
6918                 &dev->features, &features);
6919
6920         if (dev->netdev_ops->ndo_set_features)
6921                 err = dev->netdev_ops->ndo_set_features(dev, features);
6922         else
6923                 err = 0;
6924
6925         if (unlikely(err < 0)) {
6926                 netdev_err(dev,
6927                         "set_features() failed (%d); wanted %pNF, left %pNF\n",
6928                         err, &features, &dev->features);
6929                 /* return non-0 since some features might have changed and
6930                  * it's better to fire a spurious notification than miss it
6931                  */
6932                 return -1;
6933         }
6934
6935 sync_lower:
6936         /* some features must be disabled on lower devices when disabled
6937          * on an upper device (think: bonding master or bridge)
6938          */
6939         netdev_for_each_lower_dev(dev, lower, iter)
6940                 netdev_sync_lower_features(dev, lower, features);
6941
6942         if (!err)
6943                 dev->features = features;
6944
6945         return err < 0 ? 0 : 1;
6946 }
6947
6948 /**
6949  *      netdev_update_features - recalculate device features
6950  *      @dev: the device to check
6951  *
6952  *      Recalculate dev->features set and send notifications if it
6953  *      has changed. Should be called after driver or hardware dependent
6954  *      conditions might have changed that influence the features.
6955  */
6956 void netdev_update_features(struct net_device *dev)
6957 {
6958         if (__netdev_update_features(dev))
6959                 netdev_features_change(dev);
6960 }
6961 EXPORT_SYMBOL(netdev_update_features);
6962
6963 /**
6964  *      netdev_change_features - recalculate device features
6965  *      @dev: the device to check
6966  *
6967  *      Recalculate dev->features set and send notifications even
6968  *      if they have not changed. Should be called instead of
6969  *      netdev_update_features() if also dev->vlan_features might
6970  *      have changed to allow the changes to be propagated to stacked
6971  *      VLAN devices.
6972  */
6973 void netdev_change_features(struct net_device *dev)
6974 {
6975         __netdev_update_features(dev);
6976         netdev_features_change(dev);
6977 }
6978 EXPORT_SYMBOL(netdev_change_features);
6979
6980 /**
6981  *      netif_stacked_transfer_operstate -      transfer operstate
6982  *      @rootdev: the root or lower level device to transfer state from
6983  *      @dev: the device to transfer operstate to
6984  *
6985  *      Transfer operational state from root to device. This is normally
6986  *      called when a stacking relationship exists between the root
6987  *      device and the device(a leaf device).
6988  */
6989 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
6990                                         struct net_device *dev)
6991 {
6992         if (rootdev->operstate == IF_OPER_DORMANT)
6993                 netif_dormant_on(dev);
6994         else
6995                 netif_dormant_off(dev);
6996
6997         if (netif_carrier_ok(rootdev)) {
6998                 if (!netif_carrier_ok(dev))
6999                         netif_carrier_on(dev);
7000         } else {
7001                 if (netif_carrier_ok(dev))
7002                         netif_carrier_off(dev);
7003         }
7004 }
7005 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
7006
7007 #ifdef CONFIG_SYSFS
7008 static int netif_alloc_rx_queues(struct net_device *dev)
7009 {
7010         unsigned int i, count = dev->num_rx_queues;
7011         struct netdev_rx_queue *rx;
7012         size_t sz = count * sizeof(*rx);
7013
7014         BUG_ON(count < 1);
7015
7016         rx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
7017         if (!rx) {
7018                 rx = vzalloc(sz);
7019                 if (!rx)
7020                         return -ENOMEM;
7021         }
7022         dev->_rx = rx;
7023
7024         for (i = 0; i < count; i++)
7025                 rx[i].dev = dev;
7026         return 0;
7027 }
7028 #endif
7029
7030 static void netdev_init_one_queue(struct net_device *dev,
7031                                   struct netdev_queue *queue, void *_unused)
7032 {
7033         /* Initialize queue lock */
7034         spin_lock_init(&queue->_xmit_lock);
7035         netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
7036         queue->xmit_lock_owner = -1;
7037         netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
7038         queue->dev = dev;
7039 #ifdef CONFIG_BQL
7040         dql_init(&queue->dql, HZ);
7041 #endif
7042 }
7043
7044 static void netif_free_tx_queues(struct net_device *dev)
7045 {
7046         kvfree(dev->_tx);
7047 }
7048
7049 static int netif_alloc_netdev_queues(struct net_device *dev)
7050 {
7051         unsigned int count = dev->num_tx_queues;
7052         struct netdev_queue *tx;
7053         size_t sz = count * sizeof(*tx);
7054
7055         if (count < 1 || count > 0xffff)
7056                 return -EINVAL;
7057
7058         tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
7059         if (!tx) {
7060                 tx = vzalloc(sz);
7061                 if (!tx)
7062                         return -ENOMEM;
7063         }
7064         dev->_tx = tx;
7065
7066         netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
7067         spin_lock_init(&dev->tx_global_lock);
7068
7069         return 0;
7070 }
7071
7072 void netif_tx_stop_all_queues(struct net_device *dev)
7073 {
7074         unsigned int i;
7075
7076         for (i = 0; i < dev->num_tx_queues; i++) {
7077                 struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
7078                 netif_tx_stop_queue(txq);
7079         }
7080 }
7081 EXPORT_SYMBOL(netif_tx_stop_all_queues);
7082
7083 /**
7084  *      register_netdevice      - register a network device
7085  *      @dev: device to register
7086  *
7087  *      Take a completed network device structure and add it to the kernel
7088  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
7089  *      chain. 0 is returned on success. A negative errno code is returned
7090  *      on a failure to set up the device, or if the name is a duplicate.
7091  *
7092  *      Callers must hold the rtnl semaphore. You may want
7093  *      register_netdev() instead of this.
7094  *
7095  *      BUGS:
7096  *      The locking appears insufficient to guarantee two parallel registers
7097  *      will not get the same name.
7098  */
7099
7100 int register_netdevice(struct net_device *dev)
7101 {
7102         int ret;
7103         struct net *net = dev_net(dev);
7104
7105         BUG_ON(dev_boot_phase);
7106         ASSERT_RTNL();
7107
7108         might_sleep();
7109
7110         /* When net_device's are persistent, this will be fatal. */
7111         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
7112         BUG_ON(!net);
7113
7114         spin_lock_init(&dev->addr_list_lock);
7115         netdev_set_addr_lockdep_class(dev);
7116
7117         ret = dev_get_valid_name(net, dev, dev->name);
7118         if (ret < 0)
7119                 goto out;
7120
7121         /* Init, if this function is available */
7122         if (dev->netdev_ops->ndo_init) {
7123                 ret = dev->netdev_ops->ndo_init(dev);
7124                 if (ret) {
7125                         if (ret > 0)
7126                                 ret = -EIO;
7127                         goto out;
7128                 }
7129         }
7130
7131         if (((dev->hw_features | dev->features) &
7132              NETIF_F_HW_VLAN_CTAG_FILTER) &&
7133             (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
7134              !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
7135                 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
7136                 ret = -EINVAL;
7137                 goto err_uninit;
7138         }
7139
7140         ret = -EBUSY;
7141         if (!dev->ifindex)
7142                 dev->ifindex = dev_new_index(net);
7143         else if (__dev_get_by_index(net, dev->ifindex))
7144                 goto err_uninit;
7145
7146         /* Transfer changeable features to wanted_features and enable
7147          * software offloads (GSO and GRO).
7148          */
7149         dev->hw_features |= NETIF_F_SOFT_FEATURES;
7150         dev->features |= NETIF_F_SOFT_FEATURES;
7151         dev->wanted_features = dev->features & dev->hw_features;
7152
7153         if (!(dev->flags & IFF_LOOPBACK))
7154                 dev->hw_features |= NETIF_F_NOCACHE_COPY;
7155
7156         /* If IPv4 TCP segmentation offload is supported we should also
7157          * allow the device to enable segmenting the frame with the option
7158          * of ignoring a static IP ID value.  This doesn't enable the
7159          * feature itself but allows the user to enable it later.
7160          */
7161         if (dev->hw_features & NETIF_F_TSO)
7162                 dev->hw_features |= NETIF_F_TSO_MANGLEID;
7163         if (dev->vlan_features & NETIF_F_TSO)
7164                 dev->vlan_features |= NETIF_F_TSO_MANGLEID;
7165         if (dev->mpls_features & NETIF_F_TSO)
7166                 dev->mpls_features |= NETIF_F_TSO_MANGLEID;
7167         if (dev->hw_enc_features & NETIF_F_TSO)
7168                 dev->hw_enc_features |= NETIF_F_TSO_MANGLEID;
7169
7170         /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
7171          */
7172         dev->vlan_features |= NETIF_F_HIGHDMA;
7173
7174         /* Make NETIF_F_SG inheritable to tunnel devices.
7175          */
7176         dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL;
7177
7178         /* Make NETIF_F_SG inheritable to MPLS.
7179          */
7180         dev->mpls_features |= NETIF_F_SG;
7181
7182         ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
7183         ret = notifier_to_errno(ret);
7184         if (ret)
7185                 goto err_uninit;
7186
7187         ret = netdev_register_kobject(dev);
7188         if (ret)
7189                 goto err_uninit;
7190         dev->reg_state = NETREG_REGISTERED;
7191
7192         __netdev_update_features(dev);
7193
7194         /*
7195          *      Default initial state at registry is that the
7196          *      device is present.
7197          */
7198
7199         set_bit(__LINK_STATE_PRESENT, &dev->state);
7200
7201         linkwatch_init_dev(dev);
7202
7203         dev_init_scheduler(dev);
7204         dev_hold(dev);
7205         list_netdevice(dev);
7206         add_device_randomness(dev->dev_addr, dev->addr_len);
7207
7208         /* If the device has permanent device address, driver should
7209          * set dev_addr and also addr_assign_type should be set to
7210          * NET_ADDR_PERM (default value).
7211          */
7212         if (dev->addr_assign_type == NET_ADDR_PERM)
7213                 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
7214
7215         /* Notify protocols, that a new device appeared. */
7216         ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
7217         ret = notifier_to_errno(ret);
7218         if (ret) {
7219                 rollback_registered(dev);
7220                 dev->reg_state = NETREG_UNREGISTERED;
7221         }
7222         /*
7223          *      Prevent userspace races by waiting until the network
7224          *      device is fully setup before sending notifications.
7225          */
7226         if (!dev->rtnl_link_ops ||
7227             dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
7228                 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
7229
7230 out:
7231         return ret;
7232
7233 err_uninit:
7234         if (dev->netdev_ops->ndo_uninit)
7235                 dev->netdev_ops->ndo_uninit(dev);
7236         goto out;
7237 }
7238 EXPORT_SYMBOL(register_netdevice);
7239
7240 /**
7241  *      init_dummy_netdev       - init a dummy network device for NAPI
7242  *      @dev: device to init
7243  *
7244  *      This takes a network device structure and initialize the minimum
7245  *      amount of fields so it can be used to schedule NAPI polls without
7246  *      registering a full blown interface. This is to be used by drivers
7247  *      that need to tie several hardware interfaces to a single NAPI
7248  *      poll scheduler due to HW limitations.
7249  */
7250 int init_dummy_netdev(struct net_device *dev)
7251 {
7252         /* Clear everything. Note we don't initialize spinlocks
7253          * are they aren't supposed to be taken by any of the
7254          * NAPI code and this dummy netdev is supposed to be
7255          * only ever used for NAPI polls
7256          */
7257         memset(dev, 0, sizeof(struct net_device));
7258
7259         /* make sure we BUG if trying to hit standard
7260          * register/unregister code path
7261          */
7262         dev->reg_state = NETREG_DUMMY;
7263
7264         /* NAPI wants this */
7265         INIT_LIST_HEAD(&dev->napi_list);
7266
7267         /* a dummy interface is started by default */
7268         set_bit(__LINK_STATE_PRESENT, &dev->state);
7269         set_bit(__LINK_STATE_START, &dev->state);
7270
7271         /* Note : We dont allocate pcpu_refcnt for dummy devices,
7272          * because users of this 'device' dont need to change
7273          * its refcount.
7274          */
7275
7276         return 0;
7277 }
7278 EXPORT_SYMBOL_GPL(init_dummy_netdev);
7279
7280
7281 /**
7282  *      register_netdev - register a network device
7283  *      @dev: device to register
7284  *
7285  *      Take a completed network device structure and add it to the kernel
7286  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
7287  *      chain. 0 is returned on success. A negative errno code is returned
7288  *      on a failure to set up the device, or if the name is a duplicate.
7289  *
7290  *      This is a wrapper around register_netdevice that takes the rtnl semaphore
7291  *      and expands the device name if you passed a format string to
7292  *      alloc_netdev.
7293  */
7294 int register_netdev(struct net_device *dev)
7295 {
7296         int err;
7297
7298         rtnl_lock();
7299         err = register_netdevice(dev);
7300         rtnl_unlock();
7301         return err;
7302 }
7303 EXPORT_SYMBOL(register_netdev);
7304
7305 int netdev_refcnt_read(const struct net_device *dev)
7306 {
7307         int i, refcnt = 0;
7308
7309         for_each_possible_cpu(i)
7310                 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
7311         return refcnt;
7312 }
7313 EXPORT_SYMBOL(netdev_refcnt_read);
7314
7315 /**
7316  * netdev_wait_allrefs - wait until all references are gone.
7317  * @dev: target net_device
7318  *
7319  * This is called when unregistering network devices.
7320  *
7321  * Any protocol or device that holds a reference should register
7322  * for netdevice notification, and cleanup and put back the
7323  * reference if they receive an UNREGISTER event.
7324  * We can get stuck here if buggy protocols don't correctly
7325  * call dev_put.
7326  */
7327 static void netdev_wait_allrefs(struct net_device *dev)
7328 {
7329         unsigned long rebroadcast_time, warning_time;
7330         int refcnt;
7331
7332         linkwatch_forget_dev(dev);
7333
7334         rebroadcast_time = warning_time = jiffies;
7335         refcnt = netdev_refcnt_read(dev);
7336
7337         while (refcnt != 0) {
7338                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
7339                         rtnl_lock();
7340
7341                         /* Rebroadcast unregister notification */
7342                         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
7343
7344                         __rtnl_unlock();
7345                         rcu_barrier();
7346                         rtnl_lock();
7347
7348                         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7349                         if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
7350                                      &dev->state)) {
7351                                 /* We must not have linkwatch events
7352                                  * pending on unregister. If this
7353                                  * happens, we simply run the queue
7354                                  * unscheduled, resulting in a noop
7355                                  * for this device.
7356                                  */
7357                                 linkwatch_run_queue();
7358                         }
7359
7360                         __rtnl_unlock();
7361
7362                         rebroadcast_time = jiffies;
7363                 }
7364
7365                 msleep(250);
7366
7367                 refcnt = netdev_refcnt_read(dev);
7368
7369                 if (time_after(jiffies, warning_time + 10 * HZ)) {
7370                         pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
7371                                  dev->name, refcnt);
7372                         warning_time = jiffies;
7373                 }
7374         }
7375 }
7376
7377 /* The sequence is:
7378  *
7379  *      rtnl_lock();
7380  *      ...
7381  *      register_netdevice(x1);
7382  *      register_netdevice(x2);
7383  *      ...
7384  *      unregister_netdevice(y1);
7385  *      unregister_netdevice(y2);
7386  *      ...
7387  *      rtnl_unlock();
7388  *      free_netdev(y1);
7389  *      free_netdev(y2);
7390  *
7391  * We are invoked by rtnl_unlock().
7392  * This allows us to deal with problems:
7393  * 1) We can delete sysfs objects which invoke hotplug
7394  *    without deadlocking with linkwatch via keventd.
7395  * 2) Since we run with the RTNL semaphore not held, we can sleep
7396  *    safely in order to wait for the netdev refcnt to drop to zero.
7397  *
7398  * We must not return until all unregister events added during
7399  * the interval the lock was held have been completed.
7400  */
7401 void netdev_run_todo(void)
7402 {
7403         struct list_head list;
7404
7405         /* Snapshot list, allow later requests */
7406         list_replace_init(&net_todo_list, &list);
7407
7408         __rtnl_unlock();
7409
7410
7411         /* Wait for rcu callbacks to finish before next phase */
7412         if (!list_empty(&list))
7413                 rcu_barrier();
7414
7415         while (!list_empty(&list)) {
7416                 struct net_device *dev
7417                         = list_first_entry(&list, struct net_device, todo_list);
7418                 list_del(&dev->todo_list);
7419
7420                 rtnl_lock();
7421                 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7422                 __rtnl_unlock();
7423
7424                 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
7425                         pr_err("network todo '%s' but state %d\n",
7426                                dev->name, dev->reg_state);
7427                         dump_stack();
7428                         continue;
7429                 }
7430
7431                 dev->reg_state = NETREG_UNREGISTERED;
7432
7433                 netdev_wait_allrefs(dev);
7434
7435                 /* paranoia */
7436                 BUG_ON(netdev_refcnt_read(dev));
7437                 BUG_ON(!list_empty(&dev->ptype_all));
7438                 BUG_ON(!list_empty(&dev->ptype_specific));
7439                 WARN_ON(rcu_access_pointer(dev->ip_ptr));
7440                 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
7441                 WARN_ON(dev->dn_ptr);
7442
7443                 if (dev->destructor)
7444                         dev->destructor(dev);
7445
7446                 /* Report a network device has been unregistered */
7447                 rtnl_lock();
7448                 dev_net(dev)->dev_unreg_count--;
7449                 __rtnl_unlock();
7450                 wake_up(&netdev_unregistering_wq);
7451
7452                 /* Free network device */
7453                 kobject_put(&dev->dev.kobj);
7454         }
7455 }
7456
7457 /* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has
7458  * all the same fields in the same order as net_device_stats, with only
7459  * the type differing, but rtnl_link_stats64 may have additional fields
7460  * at the end for newer counters.
7461  */
7462 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
7463                              const struct net_device_stats *netdev_stats)
7464 {
7465 #if BITS_PER_LONG == 64
7466         BUILD_BUG_ON(sizeof(*stats64) < sizeof(*netdev_stats));
7467         memcpy(stats64, netdev_stats, sizeof(*stats64));
7468         /* zero out counters that only exist in rtnl_link_stats64 */
7469         memset((char *)stats64 + sizeof(*netdev_stats), 0,
7470                sizeof(*stats64) - sizeof(*netdev_stats));
7471 #else
7472         size_t i, n = sizeof(*netdev_stats) / sizeof(unsigned long);
7473         const unsigned long *src = (const unsigned long *)netdev_stats;
7474         u64 *dst = (u64 *)stats64;
7475
7476         BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64));
7477         for (i = 0; i < n; i++)
7478                 dst[i] = src[i];
7479         /* zero out counters that only exist in rtnl_link_stats64 */
7480         memset((char *)stats64 + n * sizeof(u64), 0,
7481                sizeof(*stats64) - n * sizeof(u64));
7482 #endif
7483 }
7484 EXPORT_SYMBOL(netdev_stats_to_stats64);
7485
7486 /**
7487  *      dev_get_stats   - get network device statistics
7488  *      @dev: device to get statistics from
7489  *      @storage: place to store stats
7490  *
7491  *      Get network statistics from device. Return @storage.
7492  *      The device driver may provide its own method by setting
7493  *      dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
7494  *      otherwise the internal statistics structure is used.
7495  */
7496 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
7497                                         struct rtnl_link_stats64 *storage)
7498 {
7499         const struct net_device_ops *ops = dev->netdev_ops;
7500
7501         if (ops->ndo_get_stats64) {
7502                 memset(storage, 0, sizeof(*storage));
7503                 ops->ndo_get_stats64(dev, storage);
7504         } else if (ops->ndo_get_stats) {
7505                 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
7506         } else {
7507                 netdev_stats_to_stats64(storage, &dev->stats);
7508         }
7509         storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
7510         storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
7511         storage->rx_nohandler += atomic_long_read(&dev->rx_nohandler);
7512         return storage;
7513 }
7514 EXPORT_SYMBOL(dev_get_stats);
7515
7516 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
7517 {
7518         struct netdev_queue *queue = dev_ingress_queue(dev);
7519
7520 #ifdef CONFIG_NET_CLS_ACT
7521         if (queue)
7522                 return queue;
7523         queue = kzalloc(sizeof(*queue), GFP_KERNEL);
7524         if (!queue)
7525                 return NULL;
7526         netdev_init_one_queue(dev, queue, NULL);
7527         RCU_INIT_POINTER(queue->qdisc, &noop_qdisc);
7528         queue->qdisc_sleeping = &noop_qdisc;
7529         rcu_assign_pointer(dev->ingress_queue, queue);
7530 #endif
7531         return queue;
7532 }
7533
7534 static const struct ethtool_ops default_ethtool_ops;
7535
7536 void netdev_set_default_ethtool_ops(struct net_device *dev,
7537                                     const struct ethtool_ops *ops)
7538 {
7539         if (dev->ethtool_ops == &default_ethtool_ops)
7540                 dev->ethtool_ops = ops;
7541 }
7542 EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
7543
7544 void netdev_freemem(struct net_device *dev)
7545 {
7546         char *addr = (char *)dev - dev->padded;
7547
7548         kvfree(addr);
7549 }
7550
7551 /**
7552  *      alloc_netdev_mqs - allocate network device
7553  *      @sizeof_priv:           size of private data to allocate space for
7554  *      @name:                  device name format string
7555  *      @name_assign_type:      origin of device name
7556  *      @setup:                 callback to initialize device
7557  *      @txqs:                  the number of TX subqueues to allocate
7558  *      @rxqs:                  the number of RX subqueues to allocate
7559  *
7560  *      Allocates a struct net_device with private data area for driver use
7561  *      and performs basic initialization.  Also allocates subqueue structs
7562  *      for each queue on the device.
7563  */
7564 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
7565                 unsigned char name_assign_type,
7566                 void (*setup)(struct net_device *),
7567                 unsigned int txqs, unsigned int rxqs)
7568 {
7569         struct net_device *dev;
7570         size_t alloc_size;
7571         struct net_device *p;
7572
7573         BUG_ON(strlen(name) >= sizeof(dev->name));
7574
7575         if (txqs < 1) {
7576                 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
7577                 return NULL;
7578         }
7579
7580 #ifdef CONFIG_SYSFS
7581         if (rxqs < 1) {
7582                 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
7583                 return NULL;
7584         }
7585 #endif
7586
7587         alloc_size = sizeof(struct net_device);
7588         if (sizeof_priv) {
7589                 /* ensure 32-byte alignment of private area */
7590                 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
7591                 alloc_size += sizeof_priv;
7592         }
7593         /* ensure 32-byte alignment of whole construct */
7594         alloc_size += NETDEV_ALIGN - 1;
7595
7596         p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
7597         if (!p)
7598                 p = vzalloc(alloc_size);
7599         if (!p)
7600                 return NULL;
7601
7602         dev = PTR_ALIGN(p, NETDEV_ALIGN);
7603         dev->padded = (char *)dev - (char *)p;
7604
7605         dev->pcpu_refcnt = alloc_percpu(int);
7606         if (!dev->pcpu_refcnt)
7607                 goto free_dev;
7608
7609         if (dev_addr_init(dev))
7610                 goto free_pcpu;
7611
7612         dev_mc_init(dev);
7613         dev_uc_init(dev);
7614
7615         dev_net_set(dev, &init_net);
7616
7617         dev->gso_max_size = GSO_MAX_SIZE;
7618         dev->gso_max_segs = GSO_MAX_SEGS;
7619
7620         INIT_LIST_HEAD(&dev->napi_list);
7621         INIT_LIST_HEAD(&dev->unreg_list);
7622         INIT_LIST_HEAD(&dev->close_list);
7623         INIT_LIST_HEAD(&dev->link_watch_list);
7624         INIT_LIST_HEAD(&dev->adj_list.upper);
7625         INIT_LIST_HEAD(&dev->adj_list.lower);
7626         INIT_LIST_HEAD(&dev->ptype_all);
7627         INIT_LIST_HEAD(&dev->ptype_specific);
7628 #ifdef CONFIG_NET_SCHED
7629         hash_init(dev->qdisc_hash);
7630 #endif
7631         dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
7632         setup(dev);
7633
7634         if (!dev->tx_queue_len) {
7635                 dev->priv_flags |= IFF_NO_QUEUE;
7636                 dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
7637         }
7638
7639         dev->num_tx_queues = txqs;
7640         dev->real_num_tx_queues = txqs;
7641         if (netif_alloc_netdev_queues(dev))
7642                 goto free_all;
7643
7644 #ifdef CONFIG_SYSFS
7645         dev->num_rx_queues = rxqs;
7646         dev->real_num_rx_queues = rxqs;
7647         if (netif_alloc_rx_queues(dev))
7648                 goto free_all;
7649 #endif
7650
7651         strcpy(dev->name, name);
7652         dev->name_assign_type = name_assign_type;
7653         dev->group = INIT_NETDEV_GROUP;
7654         if (!dev->ethtool_ops)
7655                 dev->ethtool_ops = &default_ethtool_ops;
7656
7657         nf_hook_ingress_init(dev);
7658
7659         return dev;
7660
7661 free_all:
7662         free_netdev(dev);
7663         return NULL;
7664
7665 free_pcpu:
7666         free_percpu(dev->pcpu_refcnt);
7667 free_dev:
7668         netdev_freemem(dev);
7669         return NULL;
7670 }
7671 EXPORT_SYMBOL(alloc_netdev_mqs);
7672
7673 /**
7674  *      free_netdev - free network device
7675  *      @dev: device
7676  *
7677  *      This function does the last stage of destroying an allocated device
7678  *      interface. The reference to the device object is released.
7679  *      If this is the last reference then it will be freed.
7680  *      Must be called in process context.
7681  */
7682 void free_netdev(struct net_device *dev)
7683 {
7684         struct napi_struct *p, *n;
7685
7686         might_sleep();
7687         netif_free_tx_queues(dev);
7688 #ifdef CONFIG_SYSFS
7689         kvfree(dev->_rx);
7690 #endif
7691
7692         kfree(rcu_dereference_protected(dev->ingress_queue, 1));
7693
7694         /* Flush device addresses */
7695         dev_addr_flush(dev);
7696
7697         list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
7698                 netif_napi_del(p);
7699
7700         free_percpu(dev->pcpu_refcnt);
7701         dev->pcpu_refcnt = NULL;
7702
7703         /*  Compatibility with error handling in drivers */
7704         if (dev->reg_state == NETREG_UNINITIALIZED) {
7705                 netdev_freemem(dev);
7706                 return;
7707         }
7708
7709         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
7710         dev->reg_state = NETREG_RELEASED;
7711
7712         /* will free via device release */
7713         put_device(&dev->dev);
7714 }
7715 EXPORT_SYMBOL(free_netdev);
7716
7717 /**
7718  *      synchronize_net -  Synchronize with packet receive processing
7719  *
7720  *      Wait for packets currently being received to be done.
7721  *      Does not block later packets from starting.
7722  */
7723 void synchronize_net(void)
7724 {
7725         might_sleep();
7726         if (rtnl_is_locked())
7727                 synchronize_rcu_expedited();
7728         else
7729                 synchronize_rcu();
7730 }
7731 EXPORT_SYMBOL(synchronize_net);
7732
7733 /**
7734  *      unregister_netdevice_queue - remove device from the kernel
7735  *      @dev: device
7736  *      @head: list
7737  *
7738  *      This function shuts down a device interface and removes it
7739  *      from the kernel tables.
7740  *      If head not NULL, device is queued to be unregistered later.
7741  *
7742  *      Callers must hold the rtnl semaphore.  You may want
7743  *      unregister_netdev() instead of this.
7744  */
7745
7746 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
7747 {
7748         ASSERT_RTNL();
7749
7750         if (head) {
7751                 list_move_tail(&dev->unreg_list, head);
7752         } else {
7753                 rollback_registered(dev);
7754                 /* Finish processing unregister after unlock */
7755                 net_set_todo(dev);
7756         }
7757 }
7758 EXPORT_SYMBOL(unregister_netdevice_queue);
7759
7760 /**
7761  *      unregister_netdevice_many - unregister many devices
7762  *      @head: list of devices
7763  *
7764  *  Note: As most callers use a stack allocated list_head,
7765  *  we force a list_del() to make sure stack wont be corrupted later.
7766  */
7767 void unregister_netdevice_many(struct list_head *head)
7768 {
7769         struct net_device *dev;
7770
7771         if (!list_empty(head)) {
7772                 rollback_registered_many(head);
7773                 list_for_each_entry(dev, head, unreg_list)
7774                         net_set_todo(dev);
7775                 list_del(head);
7776         }
7777 }
7778 EXPORT_SYMBOL(unregister_netdevice_many);
7779
7780 /**
7781  *      unregister_netdev - remove device from the kernel
7782  *      @dev: device
7783  *
7784  *      This function shuts down a device interface and removes it
7785  *      from the kernel tables.
7786  *
7787  *      This is just a wrapper for unregister_netdevice that takes
7788  *      the rtnl semaphore.  In general you want to use this and not
7789  *      unregister_netdevice.
7790  */
7791 void unregister_netdev(struct net_device *dev)
7792 {
7793         rtnl_lock();
7794         unregister_netdevice(dev);
7795         rtnl_unlock();
7796 }
7797 EXPORT_SYMBOL(unregister_netdev);
7798
7799 /**
7800  *      dev_change_net_namespace - move device to different nethost namespace
7801  *      @dev: device
7802  *      @net: network namespace
7803  *      @pat: If not NULL name pattern to try if the current device name
7804  *            is already taken in the destination network namespace.
7805  *
7806  *      This function shuts down a device interface and moves it
7807  *      to a new network namespace. On success 0 is returned, on
7808  *      a failure a netagive errno code is returned.
7809  *
7810  *      Callers must hold the rtnl semaphore.
7811  */
7812
7813 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
7814 {
7815         int err;
7816
7817         ASSERT_RTNL();
7818
7819         /* Don't allow namespace local devices to be moved. */
7820         err = -EINVAL;
7821         if (dev->features & NETIF_F_NETNS_LOCAL)
7822                 goto out;
7823
7824         /* Ensure the device has been registrered */
7825         if (dev->reg_state != NETREG_REGISTERED)
7826                 goto out;
7827
7828         /* Get out if there is nothing todo */
7829         err = 0;
7830         if (net_eq(dev_net(dev), net))
7831                 goto out;
7832
7833         /* Pick the destination device name, and ensure
7834          * we can use it in the destination network namespace.
7835          */
7836         err = -EEXIST;
7837         if (__dev_get_by_name(net, dev->name)) {
7838                 /* We get here if we can't use the current device name */
7839                 if (!pat)
7840                         goto out;
7841                 if (dev_get_valid_name(net, dev, pat) < 0)
7842                         goto out;
7843         }
7844
7845         /*
7846          * And now a mini version of register_netdevice unregister_netdevice.
7847          */
7848
7849         /* If device is running close it first. */
7850         dev_close(dev);
7851
7852         /* And unlink it from device chain */
7853         err = -ENODEV;
7854         unlist_netdevice(dev);
7855
7856         synchronize_net();
7857
7858         /* Shutdown queueing discipline. */
7859         dev_shutdown(dev);
7860
7861         /* Notify protocols, that we are about to destroy
7862            this device. They should clean all the things.
7863
7864            Note that dev->reg_state stays at NETREG_REGISTERED.
7865            This is wanted because this way 8021q and macvlan know
7866            the device is just moving and can keep their slaves up.
7867         */
7868         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
7869         rcu_barrier();
7870         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7871         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
7872
7873         /*
7874          *      Flush the unicast and multicast chains
7875          */
7876         dev_uc_flush(dev);
7877         dev_mc_flush(dev);
7878
7879         /* Send a netdev-removed uevent to the old namespace */
7880         kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
7881         netdev_adjacent_del_links(dev);
7882
7883         /* Actually switch the network namespace */
7884         dev_net_set(dev, net);
7885
7886         /* If there is an ifindex conflict assign a new one */
7887         if (__dev_get_by_index(net, dev->ifindex))
7888                 dev->ifindex = dev_new_index(net);
7889
7890         /* Send a netdev-add uevent to the new namespace */
7891         kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
7892         netdev_adjacent_add_links(dev);
7893
7894         /* Fixup kobjects */
7895         err = device_rename(&dev->dev, dev->name);
7896         WARN_ON(err);
7897
7898         /* Add the device back in the hashes */
7899         list_netdevice(dev);
7900
7901         /* Notify protocols, that a new device appeared. */
7902         call_netdevice_notifiers(NETDEV_REGISTER, dev);
7903
7904         /*
7905          *      Prevent userspace races by waiting until the network
7906          *      device is fully setup before sending notifications.
7907          */
7908         rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
7909
7910         synchronize_net();
7911         err = 0;
7912 out:
7913         return err;
7914 }
7915 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
7916
7917 static int dev_cpu_dead(unsigned int oldcpu)
7918 {
7919         struct sk_buff **list_skb;
7920         struct sk_buff *skb;
7921         unsigned int cpu;
7922         struct softnet_data *sd, *oldsd;
7923
7924         local_irq_disable();
7925         cpu = smp_processor_id();
7926         sd = &per_cpu(softnet_data, cpu);
7927         oldsd = &per_cpu(softnet_data, oldcpu);
7928
7929         /* Find end of our completion_queue. */
7930         list_skb = &sd->completion_queue;
7931         while (*list_skb)
7932                 list_skb = &(*list_skb)->next;
7933         /* Append completion queue from offline CPU. */
7934         *list_skb = oldsd->completion_queue;
7935         oldsd->completion_queue = NULL;
7936
7937         /* Append output queue from offline CPU. */
7938         if (oldsd->output_queue) {
7939                 *sd->output_queue_tailp = oldsd->output_queue;
7940                 sd->output_queue_tailp = oldsd->output_queue_tailp;
7941                 oldsd->output_queue = NULL;
7942                 oldsd->output_queue_tailp = &oldsd->output_queue;
7943         }
7944         /* Append NAPI poll list from offline CPU, with one exception :
7945          * process_backlog() must be called by cpu owning percpu backlog.
7946          * We properly handle process_queue & input_pkt_queue later.
7947          */
7948         while (!list_empty(&oldsd->poll_list)) {
7949                 struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
7950                                                             struct napi_struct,
7951                                                             poll_list);
7952
7953                 list_del_init(&napi->poll_list);
7954                 if (napi->poll == process_backlog)
7955                         napi->state = 0;
7956                 else
7957                         ____napi_schedule(sd, napi);
7958         }
7959
7960         raise_softirq_irqoff(NET_TX_SOFTIRQ);
7961         local_irq_enable();
7962
7963         /* Process offline CPU's input_pkt_queue */
7964         while ((skb = __skb_dequeue(&oldsd->process_queue))) {
7965                 netif_rx_ni(skb);
7966                 input_queue_head_incr(oldsd);
7967         }
7968         while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
7969                 netif_rx_ni(skb);
7970                 input_queue_head_incr(oldsd);
7971         }
7972
7973         return 0;
7974 }
7975
7976 /**
7977  *      netdev_increment_features - increment feature set by one
7978  *      @all: current feature set
7979  *      @one: new feature set
7980  *      @mask: mask feature set
7981  *
7982  *      Computes a new feature set after adding a device with feature set
7983  *      @one to the master device with current feature set @all.  Will not
7984  *      enable anything that is off in @mask. Returns the new feature set.
7985  */
7986 netdev_features_t netdev_increment_features(netdev_features_t all,
7987         netdev_features_t one, netdev_features_t mask)
7988 {
7989         if (mask & NETIF_F_HW_CSUM)
7990                 mask |= NETIF_F_CSUM_MASK;
7991         mask |= NETIF_F_VLAN_CHALLENGED;
7992
7993         all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask;
7994         all &= one | ~NETIF_F_ALL_FOR_ALL;
7995
7996         /* If one device supports hw checksumming, set for all. */
7997         if (all & NETIF_F_HW_CSUM)
7998                 all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM);
7999
8000         return all;
8001 }
8002 EXPORT_SYMBOL(netdev_increment_features);
8003
8004 static struct hlist_head * __net_init netdev_create_hash(void)
8005 {
8006         int i;
8007         struct hlist_head *hash;
8008
8009         hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
8010         if (hash != NULL)
8011                 for (i = 0; i < NETDEV_HASHENTRIES; i++)
8012                         INIT_HLIST_HEAD(&hash[i]);
8013
8014         return hash;
8015 }
8016
8017 /* Initialize per network namespace state */
8018 static int __net_init netdev_init(struct net *net)
8019 {
8020         if (net != &init_net)
8021                 INIT_LIST_HEAD(&net->dev_base_head);
8022
8023         net->dev_name_head = netdev_create_hash();
8024         if (net->dev_name_head == NULL)
8025                 goto err_name;
8026
8027         net->dev_index_head = netdev_create_hash();
8028         if (net->dev_index_head == NULL)
8029                 goto err_idx;
8030
8031         return 0;
8032
8033 err_idx:
8034         kfree(net->dev_name_head);
8035 err_name:
8036         return -ENOMEM;
8037 }
8038
8039 /**
8040  *      netdev_drivername - network driver for the device
8041  *      @dev: network device
8042  *
8043  *      Determine network driver for device.
8044  */
8045 const char *netdev_drivername(const struct net_device *dev)
8046 {
8047         const struct device_driver *driver;
8048         const struct device *parent;
8049         const char *empty = "";
8050
8051         parent = dev->dev.parent;
8052         if (!parent)
8053                 return empty;
8054
8055         driver = parent->driver;
8056         if (driver && driver->name)
8057                 return driver->name;
8058         return empty;
8059 }
8060
8061 static void __netdev_printk(const char *level, const struct net_device *dev,
8062                             struct va_format *vaf)
8063 {
8064         if (dev && dev->dev.parent) {
8065                 dev_printk_emit(level[1] - '0',
8066                                 dev->dev.parent,
8067                                 "%s %s %s%s: %pV",
8068                                 dev_driver_string(dev->dev.parent),
8069                                 dev_name(dev->dev.parent),
8070                                 netdev_name(dev), netdev_reg_state(dev),
8071                                 vaf);
8072         } else if (dev) {
8073                 printk("%s%s%s: %pV",
8074                        level, netdev_name(dev), netdev_reg_state(dev), vaf);
8075         } else {
8076                 printk("%s(NULL net_device): %pV", level, vaf);
8077         }
8078 }
8079
8080 void netdev_printk(const char *level, const struct net_device *dev,
8081                    const char *format, ...)
8082 {
8083         struct va_format vaf;
8084         va_list args;
8085
8086         va_start(args, format);
8087
8088         vaf.fmt = format;
8089         vaf.va = &args;
8090
8091         __netdev_printk(level, dev, &vaf);
8092
8093         va_end(args);
8094 }
8095 EXPORT_SYMBOL(netdev_printk);
8096
8097 #define define_netdev_printk_level(func, level)                 \
8098 void func(const struct net_device *dev, const char *fmt, ...)   \
8099 {                                                               \
8100         struct va_format vaf;                                   \
8101         va_list args;                                           \
8102                                                                 \
8103         va_start(args, fmt);                                    \
8104                                                                 \
8105         vaf.fmt = fmt;                                          \
8106         vaf.va = &args;                                         \
8107                                                                 \
8108         __netdev_printk(level, dev, &vaf);                      \
8109                                                                 \
8110         va_end(args);                                           \
8111 }                                                               \
8112 EXPORT_SYMBOL(func);
8113
8114 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
8115 define_netdev_printk_level(netdev_alert, KERN_ALERT);
8116 define_netdev_printk_level(netdev_crit, KERN_CRIT);
8117 define_netdev_printk_level(netdev_err, KERN_ERR);
8118 define_netdev_printk_level(netdev_warn, KERN_WARNING);
8119 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
8120 define_netdev_printk_level(netdev_info, KERN_INFO);
8121
8122 static void __net_exit netdev_exit(struct net *net)
8123 {
8124         kfree(net->dev_name_head);
8125         kfree(net->dev_index_head);
8126 }
8127
8128 static struct pernet_operations __net_initdata netdev_net_ops = {
8129         .init = netdev_init,
8130         .exit = netdev_exit,
8131 };
8132
8133 static void __net_exit default_device_exit(struct net *net)
8134 {
8135         struct net_device *dev, *aux;
8136         /*
8137          * Push all migratable network devices back to the
8138          * initial network namespace
8139          */
8140         rtnl_lock();
8141         for_each_netdev_safe(net, dev, aux) {
8142                 int err;
8143                 char fb_name[IFNAMSIZ];
8144
8145                 /* Ignore unmoveable devices (i.e. loopback) */
8146                 if (dev->features & NETIF_F_NETNS_LOCAL)
8147                         continue;
8148
8149                 /* Leave virtual devices for the generic cleanup */
8150                 if (dev->rtnl_link_ops)
8151                         continue;
8152
8153                 /* Push remaining network devices to init_net */
8154                 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
8155                 err = dev_change_net_namespace(dev, &init_net, fb_name);
8156                 if (err) {
8157                         pr_emerg("%s: failed to move %s to init_net: %d\n",
8158                                  __func__, dev->name, err);
8159                         BUG();
8160                 }
8161         }
8162         rtnl_unlock();
8163 }
8164
8165 static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
8166 {
8167         /* Return with the rtnl_lock held when there are no network
8168          * devices unregistering in any network namespace in net_list.
8169          */
8170         struct net *net;
8171         bool unregistering;
8172         DEFINE_WAIT_FUNC(wait, woken_wake_function);
8173
8174         add_wait_queue(&netdev_unregistering_wq, &wait);
8175         for (;;) {
8176                 unregistering = false;
8177                 rtnl_lock();
8178                 list_for_each_entry(net, net_list, exit_list) {
8179                         if (net->dev_unreg_count > 0) {
8180                                 unregistering = true;
8181                                 break;
8182                         }
8183                 }
8184                 if (!unregistering)
8185                         break;
8186                 __rtnl_unlock();
8187
8188                 wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
8189         }
8190         remove_wait_queue(&netdev_unregistering_wq, &wait);
8191 }
8192
8193 static void __net_exit default_device_exit_batch(struct list_head *net_list)
8194 {
8195         /* At exit all network devices most be removed from a network
8196          * namespace.  Do this in the reverse order of registration.
8197          * Do this across as many network namespaces as possible to
8198          * improve batching efficiency.
8199          */
8200         struct net_device *dev;
8201         struct net *net;
8202         LIST_HEAD(dev_kill_list);
8203
8204         /* To prevent network device cleanup code from dereferencing
8205          * loopback devices or network devices that have been freed
8206          * wait here for all pending unregistrations to complete,
8207          * before unregistring the loopback device and allowing the
8208          * network namespace be freed.
8209          *
8210          * The netdev todo list containing all network devices
8211          * unregistrations that happen in default_device_exit_batch
8212          * will run in the rtnl_unlock() at the end of
8213          * default_device_exit_batch.
8214          */
8215         rtnl_lock_unregistering(net_list);
8216         list_for_each_entry(net, net_list, exit_list) {
8217                 for_each_netdev_reverse(net, dev) {
8218                         if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
8219                                 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
8220                         else
8221                                 unregister_netdevice_queue(dev, &dev_kill_list);
8222                 }
8223         }
8224         unregister_netdevice_many(&dev_kill_list);
8225         rtnl_unlock();
8226 }
8227
8228 static struct pernet_operations __net_initdata default_device_ops = {
8229         .exit = default_device_exit,
8230         .exit_batch = default_device_exit_batch,
8231 };
8232
8233 /*
8234  *      Initialize the DEV module. At boot time this walks the device list and
8235  *      unhooks any devices that fail to initialise (normally hardware not
8236  *      present) and leaves us with a valid list of present and active devices.
8237  *
8238  */
8239
8240 /*
8241  *       This is called single threaded during boot, so no need
8242  *       to take the rtnl semaphore.
8243  */
8244 static int __init net_dev_init(void)
8245 {
8246         int i, rc = -ENOMEM;
8247
8248         BUG_ON(!dev_boot_phase);
8249
8250         if (dev_proc_init())
8251                 goto out;
8252
8253         if (netdev_kobject_init())
8254                 goto out;
8255
8256         INIT_LIST_HEAD(&ptype_all);
8257         for (i = 0; i < PTYPE_HASH_SIZE; i++)
8258                 INIT_LIST_HEAD(&ptype_base[i]);
8259
8260         INIT_LIST_HEAD(&offload_base);
8261
8262         if (register_pernet_subsys(&netdev_net_ops))
8263                 goto out;
8264
8265         /*
8266          *      Initialise the packet receive queues.
8267          */
8268
8269         for_each_possible_cpu(i) {
8270                 struct work_struct *flush = per_cpu_ptr(&flush_works, i);
8271                 struct softnet_data *sd = &per_cpu(softnet_data, i);
8272
8273                 INIT_WORK(flush, flush_backlog);
8274
8275                 skb_queue_head_init(&sd->input_pkt_queue);
8276                 skb_queue_head_init(&sd->process_queue);
8277                 INIT_LIST_HEAD(&sd->poll_list);
8278                 sd->output_queue_tailp = &sd->output_queue;
8279 #ifdef CONFIG_RPS
8280                 sd->csd.func = rps_trigger_softirq;
8281                 sd->csd.info = sd;
8282                 sd->cpu = i;
8283 #endif
8284
8285                 sd->backlog.poll = process_backlog;
8286                 sd->backlog.weight = weight_p;
8287         }
8288
8289         dev_boot_phase = 0;
8290
8291         /* The loopback device is special if any other network devices
8292          * is present in a network namespace the loopback device must
8293          * be present. Since we now dynamically allocate and free the
8294          * loopback device ensure this invariant is maintained by
8295          * keeping the loopback device as the first device on the
8296          * list of network devices.  Ensuring the loopback devices
8297          * is the first device that appears and the last network device
8298          * that disappears.
8299          */
8300         if (register_pernet_device(&loopback_net_ops))
8301                 goto out;
8302
8303         if (register_pernet_device(&default_device_ops))
8304                 goto out;
8305
8306         open_softirq(NET_TX_SOFTIRQ, net_tx_action);
8307         open_softirq(NET_RX_SOFTIRQ, net_rx_action);
8308
8309         rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead",
8310                                        NULL, dev_cpu_dead);
8311         WARN_ON(rc < 0);
8312         dst_subsys_init();
8313         rc = 0;
8314 out:
8315         return rc;
8316 }
8317
8318 subsys_initcall(net_dev_init);