2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * PACKET - implements raw packet sockets.
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
13 * Alan Cox : verify_area() now used correctly
14 * Alan Cox : new skbuff lists, look ma no backlogs!
15 * Alan Cox : tidied skbuff lists.
16 * Alan Cox : Now uses generic datagram routines I
17 * added. Also fixed the peek/read crash
18 * from all old Linux datagram code.
19 * Alan Cox : Uses the improved datagram code.
20 * Alan Cox : Added NULL's for socket options.
21 * Alan Cox : Re-commented the code.
22 * Alan Cox : Use new kernel side addressing
23 * Rob Janssen : Correct MTU usage.
24 * Dave Platt : Counter leaks caused by incorrect
25 * interrupt locking and some slightly
26 * dubious gcc output. Can you read
27 * compiler: it said _VOLATILE_
28 * Richard Kooijman : Timestamp fixes.
29 * Alan Cox : New buffers. Use sk->mac.raw.
30 * Alan Cox : sendmsg/recvmsg support.
31 * Alan Cox : Protocol setting support
32 * Alexey Kuznetsov : Untied from IPv4 stack.
33 * Cyrus Durgin : Fixed kerneld for kmod.
34 * Michal Ostrowski : Module initialization cleanup.
35 * Ulises Alonso : Frame number limit removal and
36 * packet_set_ring memory leak.
37 * Eric Biederman : Allow for > 8 byte hardware addresses.
38 * The convention is that longer addresses
39 * will simply extend the hardware address
40 * byte arrays at the end of sockaddr_ll
42 * Johann Baudy : Added TX RING.
43 * Chetan Loke : Implemented TPACKET_V3 block abstraction
45 * Copyright (C) 2011, <lokec@ccs.neu.edu>
48 * This program is free software; you can redistribute it and/or
49 * modify it under the terms of the GNU General Public License
50 * as published by the Free Software Foundation; either version
51 * 2 of the License, or (at your option) any later version.
55 #include <linux/types.h>
57 #include <linux/capability.h>
58 #include <linux/fcntl.h>
59 #include <linux/socket.h>
61 #include <linux/inet.h>
62 #include <linux/netdevice.h>
63 #include <linux/if_packet.h>
64 #include <linux/wireless.h>
65 #include <linux/kernel.h>
66 #include <linux/kmod.h>
67 #include <linux/slab.h>
68 #include <linux/vmalloc.h>
69 #include <net/net_namespace.h>
71 #include <net/protocol.h>
72 #include <linux/skbuff.h>
74 #include <linux/errno.h>
75 #include <linux/timer.h>
76 #include <asm/uaccess.h>
77 #include <asm/ioctls.h>
79 #include <asm/cacheflush.h>
81 #include <linux/proc_fs.h>
82 #include <linux/seq_file.h>
83 #include <linux/poll.h>
84 #include <linux/module.h>
85 #include <linux/init.h>
86 #include <linux/mutex.h>
87 #include <linux/if_vlan.h>
88 #include <linux/virtio_net.h>
89 #include <linux/errqueue.h>
90 #include <linux/net_tstamp.h>
91 #include <linux/if_arp.h>
94 #include <net/inet_common.h>
101    - if a device has no dev->hard_header routine, it adds and removes the ll header
102      inside itself. In this case the ll header is invisible outside the device,
103      but higher levels should still reserve dev->hard_header_len.
104      Some devices are clever enough to reallocate the skb when the header
105      will not fit into the reserved space (tunnel); others are silly
107    - a packet socket receives packets with the ll header already pulled,
108      so SOCK_RAW should push it back.
113 Incoming, dev->hard_header!=NULL
114 mac_header -> ll header
117 Outgoing, dev->hard_header!=NULL
118 mac_header -> ll header
121 Incoming, dev->hard_header==NULL
122    mac_header -> UNKNOWN position. It very likely points to the ll
123    header. PPP does this, which is wrong, because it introduces
124    asymmetry between the rx and tx paths.
127 Outgoing, dev->hard_header==NULL
128 mac_header -> data. ll header is still not built!
132    If dev->hard_header==NULL we are unlikely to restore a sensible ll header.
138 dev->hard_header != NULL
139 mac_header -> ll header
142 dev->hard_header == NULL (ll header is added by device, we cannot control it)
146    We should set nh.raw on output to the correct position;
147    the packet classifier depends on it.
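/*
 * Example (user-space sketch, illustrative, not part of this file): the
 * header rules above are what SOCK_RAW vs SOCK_DGRAM packet sockets
 * observe. A SOCK_RAW socket sees frames starting at the ll header; a
 * SOCK_DGRAM socket sees only the payload, with the link-level info
 * delivered in sockaddr_ll instead.
 *
 *	#include <sys/socket.h>
 *	#include <linux/if_packet.h>
 *	#include <net/ethernet.h>
 *	#include <arpa/inet.h>
 *
 *	// Frames arrive with the ll (e.g. Ethernet) header in the buffer:
 *	int raw = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *	// Frames arrive with the ll header already removed:
 *	int dgram = socket(AF_PACKET, SOCK_DGRAM, htons(ETH_P_ALL));
 */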
150 /* Private packet socket structures. */
152 /* identical to struct packet_mreq except it has
153 * a longer address field.
155 struct packet_mreq_max {
157 unsigned short mr_type;
158 unsigned short mr_alen;
159 unsigned char mr_address[MAX_ADDR_LEN];
163 struct tpacket_hdr *h1;
164 struct tpacket2_hdr *h2;
165 struct tpacket3_hdr *h3;
169 static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
170 int closing, int tx_ring);
172 #define V3_ALIGNMENT (8)
174 #define BLK_HDR_LEN (ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT))
176 #define BLK_PLUS_PRIV(sz_of_priv) \
177 (BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))
179 #define PGV_FROM_VMALLOC 1
181 #define BLOCK_STATUS(x) ((x)->hdr.bh1.block_status)
182 #define BLOCK_NUM_PKTS(x) ((x)->hdr.bh1.num_pkts)
183 #define BLOCK_O2FP(x) ((x)->hdr.bh1.offset_to_first_pkt)
184 #define BLOCK_LEN(x) ((x)->hdr.bh1.blk_len)
185 #define BLOCK_SNUM(x) ((x)->hdr.bh1.seq_num)
186 #define BLOCK_O2PRIV(x) ((x)->offset_to_priv)
187 #define BLOCK_PRIV(x) ((void *)((char *)(x) + BLOCK_O2PRIV(x)))
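/*
 * Worked example of the layout macros above (sizes assume a common
 * config where sizeof(struct tpacket_block_desc) == 48): BLK_HDR_LEN
 * stays 48 since it is already 8-byte aligned, so BLOCK_PRIV() points
 * 48 bytes into the block. With a 13-byte private area,
 * BLK_PLUS_PRIV(13) == 48 + ALIGN(13, 8) == 48 + 16 == 64, i.e. the
 * first packet of each block starts 64 bytes in.
 */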
190 static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
191 static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
192 struct packet_type *pt, struct net_device *orig_dev);
194 static void *packet_previous_frame(struct packet_sock *po,
195 struct packet_ring_buffer *rb,
197 static void packet_increment_head(struct packet_ring_buffer *buff);
198 static int prb_curr_blk_in_use(struct tpacket_kbdq_core *,
199 struct tpacket_block_desc *);
200 static void *prb_dispatch_next_block(struct tpacket_kbdq_core *,
201 struct packet_sock *);
202 static void prb_retire_current_block(struct tpacket_kbdq_core *,
203 struct packet_sock *, unsigned int status);
204 static int prb_queue_frozen(struct tpacket_kbdq_core *);
205 static void prb_open_block(struct tpacket_kbdq_core *,
206 struct tpacket_block_desc *);
207 static void prb_retire_rx_blk_timer_expired(unsigned long);
208 static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
209 static void prb_init_blk_timer(struct packet_sock *,
210 struct tpacket_kbdq_core *,
211 void (*func) (unsigned long));
212 static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
213 static void prb_clear_rxhash(struct tpacket_kbdq_core *,
214 struct tpacket3_hdr *);
215 static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
216 struct tpacket3_hdr *);
217 static void packet_flush_mclist(struct sock *sk);
219 struct packet_skb_cb {
220 unsigned int origlen;
222 struct sockaddr_pkt pkt;
223 struct sockaddr_ll ll;
227 #define PACKET_SKB_CB(__skb) ((struct packet_skb_cb *)((__skb)->cb))
229 #define GET_PBDQC_FROM_RB(x) ((struct tpacket_kbdq_core *)(&(x)->prb_bdqc))
230 #define GET_PBLOCK_DESC(x, bid) \
231 ((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer))
232 #define GET_CURR_PBLOCK_DESC_FROM_CORE(x) \
233 ((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer))
234 #define GET_NEXT_PRB_BLK_NUM(x) \
235 (((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \
236 ((x)->kactive_blk_num+1) : 0)
238 static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
239 static void __fanout_link(struct sock *sk, struct packet_sock *po);
241 /* register_prot_hook must be invoked with the po->bind_lock held,
242  * or from a context in which asynchronous accesses to the packet
243  * socket are not possible (packet_create()).
245 static void register_prot_hook(struct sock *sk)
247 struct packet_sock *po = pkt_sk(sk);
250 __fanout_link(sk, po);
252 dev_add_pack(&po->prot_hook);
258 /* {,__}unregister_prot_hook() must be invoked with the po->bind_lock
259 * held. If the sync parameter is true, we will temporarily drop
260 * the po->bind_lock and do a synchronize_net to make sure no
261 * asynchronous packet processing paths still refer to the elements
262 * of po->prot_hook. If the sync parameter is false, it is the
263  * caller's responsibility to take care of this.
265 static void __unregister_prot_hook(struct sock *sk, bool sync)
267 struct packet_sock *po = pkt_sk(sk);
271 __fanout_unlink(sk, po);
273 __dev_remove_pack(&po->prot_hook);
277 spin_unlock(&po->bind_lock);
279 spin_lock(&po->bind_lock);
283 static void unregister_prot_hook(struct sock *sk, bool sync)
285 struct packet_sock *po = pkt_sk(sk);
288 __unregister_prot_hook(sk, sync);
291 static inline __pure struct page *pgv_to_page(void *addr)
293 if (is_vmalloc_addr(addr))
294 return vmalloc_to_page(addr);
295 return virt_to_page(addr);
298 static void __packet_set_status(struct packet_sock *po, void *frame, int status)
300 union tpacket_uhdr h;
303 switch (po->tp_version) {
305 h.h1->tp_status = status;
306 flush_dcache_page(pgv_to_page(&h.h1->tp_status));
309 h.h2->tp_status = status;
310 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
314 WARN(1, "TPACKET version not supported.\n");
321 static int __packet_get_status(struct packet_sock *po, void *frame)
323 union tpacket_uhdr h;
328 switch (po->tp_version) {
330 flush_dcache_page(pgv_to_page(&h.h1->tp_status));
331 return h.h1->tp_status;
333 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
334 return h.h2->tp_status;
337 WARN(1, "TPACKET version not supported.\n");
343 static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec *ts,
346 struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);
349 if ((flags & SOF_TIMESTAMPING_SYS_HARDWARE) &&
350 ktime_to_timespec_cond(shhwtstamps->syststamp, ts))
351 return TP_STATUS_TS_SYS_HARDWARE;
352 if ((flags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
353 ktime_to_timespec_cond(shhwtstamps->hwtstamp, ts))
354 return TP_STATUS_TS_RAW_HARDWARE;
357 if (ktime_to_timespec_cond(skb->tstamp, ts))
358 return TP_STATUS_TS_SOFTWARE;
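/*
 * Example (user-space sketch): the flags consulted above come from the
 * PACKET_TIMESTAMP socket option, which takes SOF_TIMESTAMPING_* bits;
 * hardware timestamps additionally require driver support.
 *
 *	#include <linux/net_tstamp.h>
 *
 *	int req = SOF_TIMESTAMPING_SOFTWARE | SOF_TIMESTAMPING_RAW_HARDWARE;
 *	setsockopt(fd, SOL_PACKET, PACKET_TIMESTAMP, &req, sizeof(req));
 */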
363 static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame,
366 union tpacket_uhdr h;
370 if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
374 switch (po->tp_version) {
376 h.h1->tp_sec = ts.tv_sec;
377 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
380 h.h2->tp_sec = ts.tv_sec;
381 h.h2->tp_nsec = ts.tv_nsec;
385 WARN(1, "TPACKET version not supported.\n");
389 /* one flush is safe, as both fields always lie on the same cacheline */
390 flush_dcache_page(pgv_to_page(&h.h1->tp_sec));
396 static void *packet_lookup_frame(struct packet_sock *po,
397 struct packet_ring_buffer *rb,
398 unsigned int position,
401 unsigned int pg_vec_pos, frame_offset;
402 union tpacket_uhdr h;
404 pg_vec_pos = position / rb->frames_per_block;
405 frame_offset = position % rb->frames_per_block;
407 h.raw = rb->pg_vec[pg_vec_pos].buffer +
408 (frame_offset * rb->frame_size);
410 if (status != __packet_get_status(po, h.raw))
416 static void *packet_current_frame(struct packet_sock *po,
417 struct packet_ring_buffer *rb,
420 return packet_lookup_frame(po, rb, rb->head, status);
423 static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
425 del_timer_sync(&pkc->retire_blk_timer);
428 static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
430 struct sk_buff_head *rb_queue)
432 struct tpacket_kbdq_core *pkc;
434 pkc = tx_ring ? &po->tx_ring.prb_bdqc : &po->rx_ring.prb_bdqc;
436 spin_lock(&rb_queue->lock);
437 pkc->delete_blk_timer = 1;
438 spin_unlock(&rb_queue->lock);
440 prb_del_retire_blk_timer(pkc);
443 static void prb_init_blk_timer(struct packet_sock *po,
444 struct tpacket_kbdq_core *pkc,
445 void (*func) (unsigned long))
447 init_timer(&pkc->retire_blk_timer);
448 pkc->retire_blk_timer.data = (long)po;
449 pkc->retire_blk_timer.function = func;
450 pkc->retire_blk_timer.expires = jiffies;
453 static void prb_setup_retire_blk_timer(struct packet_sock *po, int tx_ring)
455 struct tpacket_kbdq_core *pkc;
460 pkc = tx_ring ? &po->tx_ring.prb_bdqc : &po->rx_ring.prb_bdqc;
461 prb_init_blk_timer(po, pkc, prb_retire_rx_blk_timer_expired);
464 static int prb_calc_retire_blk_tmo(struct packet_sock *po,
465 int blk_size_in_bytes)
467 struct net_device *dev;
468 unsigned int mbits = 0, msec = 0, div = 0, tmo = 0;
469 struct ethtool_cmd ecmd;
474 dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex);
475 if (unlikely(!dev)) {
477 return DEFAULT_PRB_RETIRE_TOV;
479 err = __ethtool_get_settings(dev, &ecmd);
480 speed = ethtool_cmd_speed(&ecmd);
484  * If the link speed is so slow you don't really
485  * need to worry about perf anyway
487 if (speed < SPEED_1000 || speed == SPEED_UNKNOWN) {
488 return DEFAULT_PRB_RETIRE_TOV;
495 mbits = (blk_size_in_bytes * 8) / (1024 * 1024);
507 static void prb_init_ft_ops(struct tpacket_kbdq_core *p1,
508 union tpacket_req_u *req_u)
510 p1->feature_req_word = req_u->req3.tp_feature_req_word;
513 static void init_prb_bdqc(struct packet_sock *po,
514 struct packet_ring_buffer *rb,
516 union tpacket_req_u *req_u, int tx_ring)
518 struct tpacket_kbdq_core *p1 = &rb->prb_bdqc;
519 struct tpacket_block_desc *pbd;
521 memset(p1, 0x0, sizeof(*p1));
523 p1->knxt_seq_num = 1;
525 pbd = (struct tpacket_block_desc *)pg_vec[0].buffer;
526 p1->pkblk_start = pg_vec[0].buffer;
527 p1->kblk_size = req_u->req3.tp_block_size;
528 p1->knum_blocks = req_u->req3.tp_block_nr;
529 p1->hdrlen = po->tp_hdrlen;
530 p1->version = po->tp_version;
531 p1->last_kactive_blk_num = 0;
532 po->stats.stats3.tp_freeze_q_cnt = 0;
533 if (req_u->req3.tp_retire_blk_tov)
534 p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
536 p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
537 req_u->req3.tp_block_size);
538 p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
539 p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;
541 prb_init_ft_ops(p1, req_u);
542 prb_setup_retire_blk_timer(po, tx_ring);
543 prb_open_block(p1, pbd);
546 /* Do NOT update the last_blk_num first.
547 * Assumes sk_buff_head lock is held.
549 static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
551 mod_timer(&pkc->retire_blk_timer,
552 jiffies + pkc->tov_in_jiffies);
553 pkc->last_kactive_blk_num = pkc->kactive_blk_num;
558 * 1) We refresh the timer only when we open a block.
559 * By doing this we don't waste cycles refreshing the timer
560  * on a packet-by-packet basis.
562 * With a 1MB block-size, on a 1Gbps line, it will take
563 * i) ~8 ms to fill a block + ii) memcpy etc.
564 * In this cut we are not accounting for the memcpy time.
566 * So, if the user sets the 'tmo' to 10ms then the timer
567 * will never fire while the block is still getting filled
568 * (which is what we want). However, the user could choose
569 * to close a block early and that's fine.
571 * But when the timer does fire, we check whether or not to refresh it.
572  * Since the tmo granularity is in msecs, it is not too expensive
573  * to refresh the timer, let's say every '8' msecs.
574 * Either the user can set the 'tmo' or we can derive it based on
575 * a) line-speed and b) block-size.
576 * prb_calc_retire_blk_tmo() calculates the tmo.
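/*
 * Worked example of the arithmetic above: a 1MB block is 8 Mbit of
 * payload, so on a 1Gbps link it takes roughly 8 Mbit / 1000 Mbps = 8 ms
 * to fill. That is why a user-supplied 'tmo' of 10ms will normally not
 * fire while a block is still filling.
 */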
579 static void prb_retire_rx_blk_timer_expired(unsigned long data)
581 struct packet_sock *po = (struct packet_sock *)data;
582 struct tpacket_kbdq_core *pkc = &po->rx_ring.prb_bdqc;
584 struct tpacket_block_desc *pbd;
586 spin_lock(&po->sk.sk_receive_queue.lock);
588 frozen = prb_queue_frozen(pkc);
589 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
591 if (unlikely(pkc->delete_blk_timer))
594 /* We only need to plug the race when the block is partially filled.
596 * lock(); increment BLOCK_NUM_PKTS; unlock()
597 * copy_bits() is in progress ...
598 * timer fires on other cpu:
599 * we can't retire the current block because copy_bits
603 if (BLOCK_NUM_PKTS(pbd)) {
604 while (atomic_read(&pkc->blk_fill_in_prog)) {
605 /* Waiting for skb_copy_bits to finish... */
610 if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
612 prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
613 if (!prb_dispatch_next_block(pkc, po))
618 /* Case 1. Queue was frozen because user-space was
621 if (prb_curr_blk_in_use(pkc, pbd)) {
623 * Ok, user-space is still behind.
624 * So just refresh the timer.
628 	/* Case 2. Queue was frozen, user-space caught up,
629 	 * now the link went idle && the timer fired.
630 	 * We don't have a block to close. So we open this
631 	 * block and restart the timer.
632 	 * Opening a block thaws the queue and restarts the timer.
633 	 * Thawing/timer-refresh is a side effect.
635 prb_open_block(pkc, pbd);
642 _prb_refresh_rx_retire_blk_timer(pkc);
645 spin_unlock(&po->sk.sk_receive_queue.lock);
648 static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
649 struct tpacket_block_desc *pbd1, __u32 status)
651 /* Flush everything minus the block header */
653 #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
658 	/* Skip the block header (we know the header WILL fit in 4K) */
661 end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end);
662 for (; start < end; start += PAGE_SIZE)
663 flush_dcache_page(pgv_to_page(start));
668 /* Now update the block status. */
670 BLOCK_STATUS(pbd1) = status;
672 /* Flush the block header */
674 #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
676 flush_dcache_page(pgv_to_page(start));
686 * 2) Increment active_blk_num
688  * Note: we DON'T refresh the timer on purpose,
689  * because almost always the next block will be opened.
691 static void prb_close_block(struct tpacket_kbdq_core *pkc1,
692 struct tpacket_block_desc *pbd1,
693 struct packet_sock *po, unsigned int stat)
695 __u32 status = TP_STATUS_USER | stat;
697 struct tpacket3_hdr *last_pkt;
698 struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
700 if (po->stats.stats3.tp_drops)
701 status |= TP_STATUS_LOSING;
703 last_pkt = (struct tpacket3_hdr *)pkc1->prev;
704 last_pkt->tp_next_offset = 0;
706 /* Get the ts of the last pkt */
707 if (BLOCK_NUM_PKTS(pbd1)) {
708 h1->ts_last_pkt.ts_sec = last_pkt->tp_sec;
709 h1->ts_last_pkt.ts_nsec = last_pkt->tp_nsec;
711 /* Ok, we tmo'd - so get the current time */
714 h1->ts_last_pkt.ts_sec = ts.tv_sec;
715 h1->ts_last_pkt.ts_nsec = ts.tv_nsec;
720 /* Flush the block */
721 prb_flush_block(pkc1, pbd1, status);
723 pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1);
726 static void prb_thaw_queue(struct tpacket_kbdq_core *pkc)
728 pkc->reset_pending_on_curr_blk = 0;
732 * Side effect of opening a block:
734 * 1) prb_queue is thawed.
735 * 2) retire_blk_timer is refreshed.
738 static void prb_open_block(struct tpacket_kbdq_core *pkc1,
739 struct tpacket_block_desc *pbd1)
742 struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
746 	/* We could have just memset this but we would lose the
747 	 * flexibility of making the priv area sticky
750 BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++;
751 BLOCK_NUM_PKTS(pbd1) = 0;
752 BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
756 h1->ts_first_pkt.ts_sec = ts.tv_sec;
757 h1->ts_first_pkt.ts_nsec = ts.tv_nsec;
759 pkc1->pkblk_start = (char *)pbd1;
760 pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
762 BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
763 BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN;
765 pbd1->version = pkc1->version;
766 pkc1->prev = pkc1->nxt_offset;
767 pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;
769 prb_thaw_queue(pkc1);
770 _prb_refresh_rx_retire_blk_timer(pkc1);
776 * Queue freeze logic:
777 * 1) Assume tp_block_nr = 8 blocks.
778 * 2) At time 't0', user opens Rx ring.
779 * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7
780 * 4) user-space is either sleeping or processing block '0'.
781  * 5) tpacket_rcv is currently filling block '7', since there is no space left,
782  *    it will close block-7, loop around and try to fill block '0'.
784 * __packet_lookup_frame_in_block
785 * prb_retire_current_block()
786 * prb_dispatch_next_block()
787 * |->(BLOCK_STATUS == USER) evaluates to true
788 * 5.1) Since block-0 is currently in-use, we just freeze the queue.
789 * 6) Now there are two cases:
790 * 6.1) Link goes idle right after the queue is frozen.
791  *      But remember, the last open_block() refreshed the timer.
792  *      When this timer expires, it will refresh itself so that we can
793  *      re-open block-0 in the near future.
794 * 6.2) Link is busy and keeps on receiving packets. This is a simple
795 * case and __packet_lookup_frame_in_block will check if block-0
796 * is free and can now be re-used.
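/*
 * Example (user-space sketch of the consumer side, illustrative): the
 * kernel hands a block to user-space by setting TP_STATUS_USER, and
 * user-space must hand it back by writing TP_STATUS_KERNEL, which is
 * what lets a frozen queue thaw via the re-open path described above.
 * walk_block() and pfd are hypothetical helpers.
 *
 *	struct tpacket_block_desc *pbd;		// a block in the mmap'ed ring
 *
 *	while (!(pbd->hdr.bh1.block_status & TP_STATUS_USER))
 *		poll(&pfd, 1, -1);		// wait for the kernel to close it
 *	walk_block(pbd);			// read bh1.num_pkts packets
 *	pbd->hdr.bh1.block_status = TP_STATUS_KERNEL;	// release the block
 */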
798 static void prb_freeze_queue(struct tpacket_kbdq_core *pkc,
799 struct packet_sock *po)
801 pkc->reset_pending_on_curr_blk = 1;
802 po->stats.stats3.tp_freeze_q_cnt++;
805 #define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT))
808 * If the next block is free then we will dispatch it
809 * and return a good offset.
810 * Else, we will freeze the queue.
811  * So, the caller must check the return value.
813 static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc,
814 struct packet_sock *po)
816 struct tpacket_block_desc *pbd;
820 /* 1. Get current block num */
821 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
823 /* 2. If this block is currently in_use then freeze the queue */
824 if (TP_STATUS_USER & BLOCK_STATUS(pbd)) {
825 prb_freeze_queue(pkc, po);
831 * open this block and return the offset where the first packet
832 * needs to get stored.
834 prb_open_block(pkc, pbd);
835 return (void *)pkc->nxt_offset;
838 static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
839 struct packet_sock *po, unsigned int status)
841 struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
843 /* retire/close the current block */
844 if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) {
846 * Plug the case where copy_bits() is in progress on
847 * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't
848 * have space to copy the pkt in the current block and
849 * called prb_retire_current_block()
851 * We don't need to worry about the TMO case because
852 * the timer-handler already handled this case.
854 if (!(status & TP_STATUS_BLK_TMO)) {
855 while (atomic_read(&pkc->blk_fill_in_prog)) {
856 /* Waiting for skb_copy_bits to finish... */
860 prb_close_block(pkc, pbd, po, status);
865 static int prb_curr_blk_in_use(struct tpacket_kbdq_core *pkc,
866 struct tpacket_block_desc *pbd)
868 return TP_STATUS_USER & BLOCK_STATUS(pbd);
871 static int prb_queue_frozen(struct tpacket_kbdq_core *pkc)
873 return pkc->reset_pending_on_curr_blk;
876 static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb)
878 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
879 atomic_dec(&pkc->blk_fill_in_prog);
882 static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc,
883 struct tpacket3_hdr *ppd)
885 ppd->hv1.tp_rxhash = skb_get_rxhash(pkc->skb);
888 static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc,
889 struct tpacket3_hdr *ppd)
891 ppd->hv1.tp_rxhash = 0;
894 static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc,
895 struct tpacket3_hdr *ppd)
897 if (vlan_tx_tag_present(pkc->skb)) {
898 ppd->hv1.tp_vlan_tci = vlan_tx_tag_get(pkc->skb);
899 ppd->tp_status = TP_STATUS_VLAN_VALID;
901 ppd->hv1.tp_vlan_tci = 0;
902 ppd->tp_status = TP_STATUS_AVAILABLE;
906 static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc,
907 struct tpacket3_hdr *ppd)
909 prb_fill_vlan_info(pkc, ppd);
911 if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH)
912 prb_fill_rxhash(pkc, ppd);
914 prb_clear_rxhash(pkc, ppd);
917 static void prb_fill_curr_block(char *curr,
918 struct tpacket_kbdq_core *pkc,
919 struct tpacket_block_desc *pbd,
922 struct tpacket3_hdr *ppd;
924 ppd = (struct tpacket3_hdr *)curr;
925 ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len);
927 pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len);
928 BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len);
929 BLOCK_NUM_PKTS(pbd) += 1;
930 atomic_inc(&pkc->blk_fill_in_prog);
931 prb_run_all_ft_ops(pkc, ppd);
934 /* Assumes the caller has the sk->sk_receive_queue lock */
935 static void *__packet_lookup_frame_in_block(struct packet_sock *po,
941 struct tpacket_kbdq_core *pkc;
942 struct tpacket_block_desc *pbd;
945 pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
946 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
948 /* Queue is frozen when user space is lagging behind */
949 if (prb_queue_frozen(pkc)) {
951 		 * Check if the last block which caused the queue to freeze
952 		 * is still in_use by user-space.
954 if (prb_curr_blk_in_use(pkc, pbd)) {
955 /* Can't record this packet */
959 * Ok, the block was released by user-space.
960 * Now let's open that block.
961 			 * Opening a block also thaws the queue.
962 * Thawing is a side effect.
964 prb_open_block(pkc, pbd);
969 curr = pkc->nxt_offset;
971 end = (char *)pbd + pkc->kblk_size;
973 /* first try the current block */
974 if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
975 prb_fill_curr_block(curr, pkc, pbd, len);
979 /* Ok, close the current block */
980 prb_retire_current_block(pkc, po, 0);
982 /* Now, try to dispatch the next block */
983 curr = (char *)prb_dispatch_next_block(pkc, po);
985 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
986 prb_fill_curr_block(curr, pkc, pbd, len);
991 	 * No free blocks are available. User-space hasn't caught up yet.
992 	 * Queue was just frozen and now this packet will get dropped.
997 static void *packet_current_rx_frame(struct packet_sock *po,
999 int status, unsigned int len)
1002 switch (po->tp_version) {
1005 curr = packet_lookup_frame(po, &po->rx_ring,
1006 po->rx_ring.head, status);
1009 return __packet_lookup_frame_in_block(po, skb, status, len);
1011 WARN(1, "TPACKET version not supported\n");
1017 static void *prb_lookup_block(struct packet_sock *po,
1018 struct packet_ring_buffer *rb,
1022 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
1023 struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx);
1025 if (status != BLOCK_STATUS(pbd))
1030 static int prb_previous_blk_num(struct packet_ring_buffer *rb)
1033 if (rb->prb_bdqc.kactive_blk_num)
1034 prev = rb->prb_bdqc.kactive_blk_num-1;
1036 prev = rb->prb_bdqc.knum_blocks-1;
1040 /* Assumes the caller holds the sk_receive_queue lock */
1041 static void *__prb_previous_block(struct packet_sock *po,
1042 struct packet_ring_buffer *rb,
1045 unsigned int previous = prb_previous_blk_num(rb);
1046 return prb_lookup_block(po, rb, previous, status);
1049 static void *packet_previous_rx_frame(struct packet_sock *po,
1050 struct packet_ring_buffer *rb,
1053 if (po->tp_version <= TPACKET_V2)
1054 return packet_previous_frame(po, rb, status);
1056 return __prb_previous_block(po, rb, status);
1059 static void packet_increment_rx_head(struct packet_sock *po,
1060 struct packet_ring_buffer *rb)
1062 switch (po->tp_version) {
1065 return packet_increment_head(rb);
1068 WARN(1, "TPACKET version not supported.\n");
1074 static void *packet_previous_frame(struct packet_sock *po,
1075 struct packet_ring_buffer *rb,
1078 unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
1079 return packet_lookup_frame(po, rb, previous, status);
1082 static void packet_increment_head(struct packet_ring_buffer *buff)
1084 buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
1087 static bool packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
1089 struct sock *sk = &po->sk;
1092 if (po->prot_hook.func != tpacket_rcv)
1093 return (atomic_read(&sk->sk_rmem_alloc) + skb->truesize)
1096 spin_lock(&sk->sk_receive_queue.lock);
1097 if (po->tp_version == TPACKET_V3)
1098 has_room = prb_lookup_block(po, &po->rx_ring,
1099 po->rx_ring.prb_bdqc.kactive_blk_num,
1102 has_room = packet_lookup_frame(po, &po->rx_ring,
1105 spin_unlock(&sk->sk_receive_queue.lock);
1110 static void packet_sock_destruct(struct sock *sk)
1112 skb_queue_purge(&sk->sk_error_queue);
1114 WARN_ON(atomic_read(&sk->sk_rmem_alloc));
1115 WARN_ON(atomic_read(&sk->sk_wmem_alloc));
1117 if (!sock_flag(sk, SOCK_DEAD)) {
1118 pr_err("Attempt to release alive packet socket: %p\n", sk);
1122 sk_refcnt_debug_dec(sk);
1125 static int fanout_rr_next(struct packet_fanout *f, unsigned int num)
1127 int x = atomic_read(&f->rr_cur) + 1;
1135 static unsigned int fanout_demux_hash(struct packet_fanout *f,
1136 struct sk_buff *skb,
1139 return (((u64)skb->rxhash) * num) >> 32;
1142 static unsigned int fanout_demux_lb(struct packet_fanout *f,
1143 struct sk_buff *skb,
1148 cur = atomic_read(&f->rr_cur);
1149 while ((old = atomic_cmpxchg(&f->rr_cur, cur,
1150 fanout_rr_next(f, num))) != cur)
1155 static unsigned int fanout_demux_cpu(struct packet_fanout *f,
1156 struct sk_buff *skb,
1159 return smp_processor_id() % num;
1162 static unsigned int fanout_demux_rollover(struct packet_fanout *f,
1163 struct sk_buff *skb,
1164 unsigned int idx, unsigned int skip,
1169 i = j = min_t(int, f->next[idx], num - 1);
1171 if (i != skip && packet_rcv_has_room(pkt_sk(f->arr[i]), skb)) {
1183 static bool fanout_has_flag(struct packet_fanout *f, u16 flag)
1185 return f->flags & (flag >> 8);
1188 static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
1189 struct packet_type *pt, struct net_device *orig_dev)
1191 struct packet_fanout *f = pt->af_packet_priv;
1192 unsigned int num = f->num_members;
1193 struct packet_sock *po;
1196 if (!net_eq(dev_net(dev), read_pnet(&f->net)) ||
1203 case PACKET_FANOUT_HASH:
1205 if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) {
1206 skb = ip_check_defrag(skb, IP_DEFRAG_AF_PACKET);
1210 skb_get_rxhash(skb);
1211 idx = fanout_demux_hash(f, skb, num);
1213 case PACKET_FANOUT_LB:
1214 idx = fanout_demux_lb(f, skb, num);
1216 case PACKET_FANOUT_CPU:
1217 idx = fanout_demux_cpu(f, skb, num);
1219 case PACKET_FANOUT_ROLLOVER:
1220 idx = fanout_demux_rollover(f, skb, 0, (unsigned int) -1, num);
1224 po = pkt_sk(f->arr[idx]);
1225 if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER) &&
1226 unlikely(!packet_rcv_has_room(po, skb))) {
1227 idx = fanout_demux_rollover(f, skb, idx, idx, num);
1228 po = pkt_sk(f->arr[idx]);
1231 return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
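/*
 * Example (user-space sketch): a fanout group is joined with the
 * PACKET_FANOUT socket option. The low 16 bits carry the group id and
 * the high 16 bits the mode plus flags, matching the type_flags
 * decoding in fanout_add() below.
 *
 *	int id = 42;	// arbitrary group id, illustrative
 *	int arg = (PACKET_FANOUT_HASH << 16) | id;
 *	setsockopt(fd, SOL_PACKET, PACKET_FANOUT, &arg, sizeof(arg));
 */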
1234 DEFINE_MUTEX(fanout_mutex);
1235 EXPORT_SYMBOL_GPL(fanout_mutex);
1236 static LIST_HEAD(fanout_list);
1238 static void __fanout_link(struct sock *sk, struct packet_sock *po)
1240 struct packet_fanout *f = po->fanout;
1242 spin_lock(&f->lock);
1243 f->arr[f->num_members] = sk;
1246 spin_unlock(&f->lock);
1249 static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
1251 struct packet_fanout *f = po->fanout;
1254 spin_lock(&f->lock);
1255 for (i = 0; i < f->num_members; i++) {
1256 if (f->arr[i] == sk)
1259 BUG_ON(i >= f->num_members);
1260 f->arr[i] = f->arr[f->num_members - 1];
1262 spin_unlock(&f->lock);
1265 static bool match_fanout_group(struct packet_type *ptype, struct sock *sk)
1267 	if (ptype->af_packet_priv == (void *)((struct packet_sock *)sk)->fanout)
1273 static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
1275 struct packet_sock *po = pkt_sk(sk);
1276 struct packet_fanout *f, *match;
1277 u8 type = type_flags & 0xff;
1278 u8 flags = type_flags >> 8;
1282 case PACKET_FANOUT_ROLLOVER:
1283 if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)
1285 case PACKET_FANOUT_HASH:
1286 case PACKET_FANOUT_LB:
1287 case PACKET_FANOUT_CPU:
1299 mutex_lock(&fanout_mutex);
1301 list_for_each_entry(f, &fanout_list, list) {
1303 read_pnet(&f->net) == sock_net(sk)) {
1309 if (match && match->flags != flags)
1313 match = kzalloc(sizeof(*match), GFP_KERNEL);
1316 write_pnet(&match->net, sock_net(sk));
1319 match->flags = flags;
1320 atomic_set(&match->rr_cur, 0);
1321 INIT_LIST_HEAD(&match->list);
1322 spin_lock_init(&match->lock);
1323 atomic_set(&match->sk_ref, 0);
1324 match->prot_hook.type = po->prot_hook.type;
1325 match->prot_hook.dev = po->prot_hook.dev;
1326 match->prot_hook.func = packet_rcv_fanout;
1327 match->prot_hook.af_packet_priv = match;
1328 match->prot_hook.id_match = match_fanout_group;
1329 dev_add_pack(&match->prot_hook);
1330 list_add(&match->list, &fanout_list);
1333 if (match->type == type &&
1334 match->prot_hook.type == po->prot_hook.type &&
1335 match->prot_hook.dev == po->prot_hook.dev) {
1337 if (atomic_read(&match->sk_ref) < PACKET_FANOUT_MAX) {
1338 __dev_remove_pack(&po->prot_hook);
1340 atomic_inc(&match->sk_ref);
1341 __fanout_link(sk, po);
1346 mutex_unlock(&fanout_mutex);
1350 static void fanout_release(struct sock *sk)
1352 struct packet_sock *po = pkt_sk(sk);
1353 struct packet_fanout *f;
1359 mutex_lock(&fanout_mutex);
1362 if (atomic_dec_and_test(&f->sk_ref)) {
1364 dev_remove_pack(&f->prot_hook);
1367 mutex_unlock(&fanout_mutex);
1370 static const struct proto_ops packet_ops;
1372 static const struct proto_ops packet_ops_spkt;
1374 static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
1375 struct packet_type *pt, struct net_device *orig_dev)
1378 struct sockaddr_pkt *spkt;
1381 * When we registered the protocol we saved the socket in the data
1382 * field for just this event.
1385 sk = pt->af_packet_priv;
1388 * Yank back the headers [hope the device set this
1389 * right or kerboom...]
1391 * Incoming packets have ll header pulled,
1394 	 * For outgoing ones skb->data == skb_mac_header(skb),
1395 	 * so this procedure is a no-op.
1398 if (skb->pkt_type == PACKET_LOOPBACK)
1401 if (!net_eq(dev_net(dev), sock_net(sk)))
1404 skb = skb_share_check(skb, GFP_ATOMIC);
1408 /* drop any routing info */
1411 /* drop conntrack reference */
1414 spkt = &PACKET_SKB_CB(skb)->sa.pkt;
1416 skb_push(skb, skb->data - skb_mac_header(skb));
1419 * The SOCK_PACKET socket receives _all_ frames.
1422 spkt->spkt_family = dev->type;
1423 strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
1424 spkt->spkt_protocol = skb->protocol;
1427 	 *	Charge the memory to the socket. This is done specifically
1428 	 *	to prevent sockets from using up all the memory.
1431 if (sock_queue_rcv_skb(sk, skb) == 0)
1442 * Output a raw packet to a device layer. This bypasses all the other
1443 * protocol layers and you must therefore supply it with a complete frame
1446 static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
1447 struct msghdr *msg, size_t len)
1449 struct sock *sk = sock->sk;
1450 struct sockaddr_pkt *saddr = (struct sockaddr_pkt *)msg->msg_name;
1451 struct sk_buff *skb = NULL;
1452 struct net_device *dev;
1458 * Get and verify the address.
1462 if (msg->msg_namelen < sizeof(struct sockaddr))
1464 if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
1465 proto = saddr->spkt_protocol;
1467 return -ENOTCONN; /* SOCK_PACKET must be sent giving an address */
1470 * Find the device first to size check it
1473 saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0;
1476 dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
1482 if (!(dev->flags & IFF_UP))
1486 * You may not queue a frame bigger than the mtu. This is the lowest level
1487 * raw protocol and you must do your own fragmentation at this level.
1490 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
1491 if (!netif_supports_nofcs(dev)) {
1492 err = -EPROTONOSUPPORT;
1495 extra_len = 4; /* We're doing our own CRC */
1499 if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN + extra_len)
1503 size_t reserved = LL_RESERVED_SPACE(dev);
1504 int tlen = dev->needed_tailroom;
1505 unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;
1508 skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL);
1511 /* FIXME: Save some space for broken drivers that write a hard
1512 * header at transmission time by themselves. PPP is the notable
1513 * one here. This should really be fixed at the driver level.
1515 skb_reserve(skb, reserved);
1516 skb_reset_network_header(skb);
1518 /* Try to align data part correctly */
1523 skb_reset_network_header(skb);
1525 err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
1531 if (len > (dev->mtu + dev->hard_header_len + extra_len)) {
1532 /* Earlier code assumed this would be a VLAN pkt,
1533 * double-check this now that we have the actual
1536 struct ethhdr *ehdr;
1537 skb_reset_mac_header(skb);
1538 ehdr = eth_hdr(skb);
1539 if (ehdr->h_proto != htons(ETH_P_8021Q)) {
1545 skb->protocol = proto;
1547 skb->priority = sk->sk_priority;
1548 skb->mark = sk->sk_mark;
1550 sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
1552 if (unlikely(extra_len == 4))
1555 skb_probe_transport_header(skb, 0);
1557 dev_queue_xmit(skb);
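/*
 * Example (user-space sketch of this legacy SOCK_PACKET send path,
 * illustrative): the destination device is named in sockaddr_pkt and
 * the buffer must hold a complete frame, as noted above. frame and
 * frame_len are hypothetical.
 *
 *	struct sockaddr_pkt spkt = { .spkt_family = AF_PACKET };
 *	strncpy((char *)spkt.spkt_device, "eth0", sizeof(spkt.spkt_device));
 *	spkt.spkt_protocol = htons(ETH_P_IP);
 *	sendto(fd, frame, frame_len, 0,
 *	       (struct sockaddr *)&spkt, sizeof(spkt));
 */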
1568 static unsigned int run_filter(const struct sk_buff *skb,
1569 const struct sock *sk,
1572 struct sk_filter *filter;
1575 filter = rcu_dereference(sk->sk_filter);
1577 res = SK_RUN_FILTER(filter, skb);
1584  * This function does lazy skb cloning in the hope that most packets
1585  * are discarded by BPF.
1587  * Note the tricky part: we DO mangle the shared skb! skb->data, skb->len
1588  * and skb->cb are mangled. It works because (and until) packets
1589  * falling here are owned by the current CPU. Output packets are cloned
1590  * by dev_queue_xmit_nit(), input packets are processed by net_bh
1591  * sequentially, so if we return the skb to its original state on exit,
1592  * we will not harm anyone.
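/*
 * Example (user-space sketch): the filter consulted by run_filter() is
 * attached with SO_ATTACH_FILTER. A minimal classic-BPF program that
 * accepts every packet looks like this.
 *
 *	#include <linux/filter.h>
 *
 *	struct sock_filter code[] = {
 *		BPF_STMT(BPF_RET | BPF_K, 0x0000ffff),	// accept up to 64KB
 *	};
 *	struct sock_fprog prog = { .len = 1, .filter = code };
 *	setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog));
 */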
1595 static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
1596 struct packet_type *pt, struct net_device *orig_dev)
1599 struct sockaddr_ll *sll;
1600 struct packet_sock *po;
1601 u8 *skb_head = skb->data;
1602 int skb_len = skb->len;
1603 unsigned int snaplen, res;
1605 if (skb->pkt_type == PACKET_LOOPBACK)
1608 sk = pt->af_packet_priv;
1611 if (!net_eq(dev_net(dev), sock_net(sk)))
1616 if (dev->header_ops) {
1617 	if (dev->header_ops) {
1618 		/* The device has an explicit notion of ll header,
1620 		 * exported to higher levels.
1621 		 * Otherwise, the device hides the details of its frame
1622 		 * structure, so that the corresponding packet head is
1623 		 * never delivered to the user.
1624 if (sk->sk_type != SOCK_DGRAM)
1625 skb_push(skb, skb->data - skb_mac_header(skb));
1626 else if (skb->pkt_type == PACKET_OUTGOING) {
1627 /* Special case: outgoing packets have ll header at head */
1628 skb_pull(skb, skb_network_offset(skb));
1634 res = run_filter(skb, sk, snaplen);
1636 goto drop_n_restore;
1640 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
1643 if (skb_shared(skb)) {
1644 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
1648 if (skb_head != skb->data) {
1649 skb->data = skb_head;
1656 BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 >
1659 sll = &PACKET_SKB_CB(skb)->sa.ll;
1660 sll->sll_family = AF_PACKET;
1661 sll->sll_hatype = dev->type;
1662 sll->sll_protocol = skb->protocol;
1663 sll->sll_pkttype = skb->pkt_type;
1664 if (unlikely(po->origdev))
1665 sll->sll_ifindex = orig_dev->ifindex;
1667 sll->sll_ifindex = dev->ifindex;
1669 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1671 PACKET_SKB_CB(skb)->origlen = skb->len;
1673 if (pskb_trim(skb, snaplen))
1676 skb_set_owner_r(skb, sk);
1680 /* drop conntrack reference */
1683 spin_lock(&sk->sk_receive_queue.lock);
1684 po->stats.stats1.tp_packets++;
1685 skb->dropcount = atomic_read(&sk->sk_drops);
1686 __skb_queue_tail(&sk->sk_receive_queue, skb);
1687 spin_unlock(&sk->sk_receive_queue.lock);
1688 sk->sk_data_ready(sk, skb->len);
1692 spin_lock(&sk->sk_receive_queue.lock);
1693 po->stats.stats1.tp_drops++;
1694 atomic_inc(&sk->sk_drops);
1695 spin_unlock(&sk->sk_receive_queue.lock);
1698 if (skb_head != skb->data && skb_shared(skb)) {
1699 skb->data = skb_head;
1707 static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
1708 struct packet_type *pt, struct net_device *orig_dev)
1711 struct packet_sock *po;
1712 struct sockaddr_ll *sll;
1713 union tpacket_uhdr h;
1714 u8 *skb_head = skb->data;
1715 int skb_len = skb->len;
1716 unsigned int snaplen, res;
1717 unsigned long status = TP_STATUS_USER;
1718 unsigned short macoff, netoff, hdrlen;
1719 struct sk_buff *copy_skb = NULL;
1723 if (skb->pkt_type == PACKET_LOOPBACK)
1726 sk = pt->af_packet_priv;
1729 if (!net_eq(dev_net(dev), sock_net(sk)))
1732 if (dev->header_ops) {
1733 if (sk->sk_type != SOCK_DGRAM)
1734 skb_push(skb, skb->data - skb_mac_header(skb));
1735 else if (skb->pkt_type == PACKET_OUTGOING) {
1736 /* Special case: outgoing packets have ll header at head */
1737 skb_pull(skb, skb_network_offset(skb));
1741 if (skb->ip_summed == CHECKSUM_PARTIAL)
1742 status |= TP_STATUS_CSUMNOTREADY;
1746 res = run_filter(skb, sk, snaplen);
1748 goto drop_n_restore;
1752 if (sk->sk_type == SOCK_DGRAM) {
1753 macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
1756 unsigned int maclen = skb_network_offset(skb);
1757 netoff = TPACKET_ALIGN(po->tp_hdrlen +
1758 (maclen < 16 ? 16 : maclen)) +
1760 macoff = netoff - maclen;
1762 if (po->tp_version <= TPACKET_V2) {
1763 if (macoff + snaplen > po->rx_ring.frame_size) {
1764 if (po->copy_thresh &&
1765 atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
1766 if (skb_shared(skb)) {
1767 copy_skb = skb_clone(skb, GFP_ATOMIC);
1769 copy_skb = skb_get(skb);
1770 skb_head = skb->data;
1773 skb_set_owner_r(copy_skb, sk);
1775 snaplen = po->rx_ring.frame_size - macoff;
1776 if ((int)snaplen < 0)
1780 spin_lock(&sk->sk_receive_queue.lock);
1781 h.raw = packet_current_rx_frame(po, skb,
1782 TP_STATUS_KERNEL, (macoff+snaplen));
1785 if (po->tp_version <= TPACKET_V2) {
1786 packet_increment_rx_head(po, &po->rx_ring);
1788 * LOSING will be reported till you read the stats,
1789 * because it's COR - Clear On Read.
1790 		 * Anyway, moving it for V1/V2 only as V3 doesn't need this
1793 if (po->stats.stats1.tp_drops)
1794 status |= TP_STATUS_LOSING;
1796 po->stats.stats1.tp_packets++;
1798 status |= TP_STATUS_COPY;
1799 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
1801 spin_unlock(&sk->sk_receive_queue.lock);
1803 skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
1805 if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
1806 getnstimeofday(&ts);
1808 status |= ts_status;
1810 switch (po->tp_version) {
1812 h.h1->tp_len = skb->len;
1813 h.h1->tp_snaplen = snaplen;
1814 h.h1->tp_mac = macoff;
1815 h.h1->tp_net = netoff;
1816 h.h1->tp_sec = ts.tv_sec;
1817 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
1818 hdrlen = sizeof(*h.h1);
1821 h.h2->tp_len = skb->len;
1822 h.h2->tp_snaplen = snaplen;
1823 h.h2->tp_mac = macoff;
1824 h.h2->tp_net = netoff;
1825 h.h2->tp_sec = ts.tv_sec;
1826 h.h2->tp_nsec = ts.tv_nsec;
1827 if (vlan_tx_tag_present(skb)) {
1828 h.h2->tp_vlan_tci = vlan_tx_tag_get(skb);
1829 status |= TP_STATUS_VLAN_VALID;
1831 h.h2->tp_vlan_tci = 0;
1833 h.h2->tp_padding = 0;
1834 hdrlen = sizeof(*h.h2);
1837 		/* tp_next_offset and vlan are already populated above,
1838 		 * so DON'T clear those fields here
1840 h.h3->tp_status |= status;
1841 h.h3->tp_len = skb->len;
1842 h.h3->tp_snaplen = snaplen;
1843 h.h3->tp_mac = macoff;
1844 h.h3->tp_net = netoff;
1845 h.h3->tp_sec = ts.tv_sec;
1846 h.h3->tp_nsec = ts.tv_nsec;
1847 hdrlen = sizeof(*h.h3);
1853 sll = h.raw + TPACKET_ALIGN(hdrlen);
1854 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1855 sll->sll_family = AF_PACKET;
1856 sll->sll_hatype = dev->type;
1857 sll->sll_protocol = skb->protocol;
1858 sll->sll_pkttype = skb->pkt_type;
1859 if (unlikely(po->origdev))
1860 sll->sll_ifindex = orig_dev->ifindex;
1862 sll->sll_ifindex = dev->ifindex;
1865 #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
1869 if (po->tp_version <= TPACKET_V2) {
1870 end = (u8 *)PAGE_ALIGN((unsigned long)h.raw
1871 + macoff + snaplen);
1872 for (start = h.raw; start < end; start += PAGE_SIZE)
1873 flush_dcache_page(pgv_to_page(start));
1878 if (po->tp_version <= TPACKET_V2)
1879 __packet_set_status(po, h.raw, status);
1881 prb_clear_blk_fill_status(&po->rx_ring);
1883 sk->sk_data_ready(sk, 0);
1886 if (skb_head != skb->data && skb_shared(skb)) {
1887 skb->data = skb_head;
1895 po->stats.stats1.tp_drops++;
1896 spin_unlock(&sk->sk_receive_queue.lock);
1898 sk->sk_data_ready(sk, 0);
1899 kfree_skb(copy_skb);
1900 goto drop_n_restore;
1903 static void tpacket_destruct_skb(struct sk_buff *skb)
1905 struct packet_sock *po = pkt_sk(skb->sk);
1908 if (likely(po->tx_ring.pg_vec)) {
1911 ph = skb_shinfo(skb)->destructor_arg;
1912 BUG_ON(atomic_read(&po->tx_ring.pending) == 0);
1913 atomic_dec(&po->tx_ring.pending);
1915 ts = __packet_set_timestamp(po, ph, skb);
1916 __packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts);
1922 static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
1923 void *frame, struct net_device *dev, int size_max,
1924 __be16 proto, unsigned char *addr, int hlen)
1926 union tpacket_uhdr ph;
1927 int to_write, offset, len, tp_len, nr_frags, len_max, max_frame_len;
1928 struct socket *sock = po->sk.sk_socket;
1935 skb->protocol = proto;
1937 skb->priority = po->sk.sk_priority;
1938 skb->mark = po->sk.sk_mark;
1939 sock_tx_timestamp(&po->sk, &skb_shinfo(skb)->tx_flags);
1940 skb_shinfo(skb)->destructor_arg = ph.raw;
1942 switch (po->tp_version) {
1944 tp_len = ph.h2->tp_len;
1947 tp_len = ph.h1->tp_len;
1951 skb_reserve(skb, hlen);
1952 skb_reset_network_header(skb);
1953 skb_probe_transport_header(skb, 0);
1955 if (po->tp_tx_has_off) {
1956 int off_min, off_max, off;
1957 off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
1958 off_max = po->tx_ring.frame_size - tp_len;
1959 if (sock->type == SOCK_DGRAM) {
1960 switch (po->tp_version) {
1962 off = ph.h2->tp_net;
1965 off = ph.h1->tp_net;
1969 switch (po->tp_version) {
1971 off = ph.h2->tp_mac;
1974 off = ph.h1->tp_mac;
1978 if (unlikely((off < off_min) || (off_max < off)))
1980 data = ph.raw + off;
1982 data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll);
1986 if (sock->type == SOCK_DGRAM) {
1987 err = dev_hard_header(skb, dev, ntohs(proto), addr,
1989 if (unlikely(err < 0))
1991 } else if (dev->hard_header_len) {
1992 /* net device doesn't like empty head */
1993 if (unlikely(tp_len <= dev->hard_header_len)) {
1994 pr_err("packet size is too short (%d < %d)\n",
1995 tp_len, dev->hard_header_len);
1999 skb_push(skb, dev->hard_header_len);
2000 err = skb_store_bits(skb, 0, data,
2001 dev->hard_header_len);
2005 if (dev->type == ARPHRD_ETHER)
2006 skb->protocol = eth_type_trans(skb, dev);
2008 data += dev->hard_header_len;
2009 to_write -= dev->hard_header_len;
2012 max_frame_len = dev->mtu + dev->hard_header_len;
2013 if (skb->protocol == htons(ETH_P_8021Q))
2014 max_frame_len += VLAN_HLEN;
2016 if (size_max > max_frame_len)
2017 size_max = max_frame_len;
2019 if (unlikely(tp_len > size_max)) {
2020 pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
2024 offset = offset_in_page(data);
2025 len_max = PAGE_SIZE - offset;
2026 len = ((to_write > len_max) ? len_max : to_write);
2028 skb->data_len = to_write;
2029 skb->len += to_write;
2030 skb->truesize += to_write;
2031 atomic_add(to_write, &po->sk.sk_wmem_alloc);
2033 while (likely(to_write)) {
2034 nr_frags = skb_shinfo(skb)->nr_frags;
2036 if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
2037 			pr_err("Packet exceeds the number of skb frags (%lu)\n",
2042 page = pgv_to_page(data);
2044 flush_dcache_page(page);
2046 skb_fill_page_desc(skb, nr_frags, page, offset, len);
2049 len_max = PAGE_SIZE;
2050 len = ((to_write > len_max) ? len_max : to_write);
2056 static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
2058 struct sk_buff *skb;
2059 struct net_device *dev;
2061 bool need_rls_dev = false;
2064 struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
2065 int tp_len, size_max;
2066 unsigned char *addr;
2068 int status = TP_STATUS_AVAILABLE;
2071 mutex_lock(&po->pg_vec_lock);
2073 if (saddr == NULL) {
2074 dev = po->prot_hook.dev;
2079 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2081 if (msg->msg_namelen < (saddr->sll_halen
2082 + offsetof(struct sockaddr_ll,
2085 proto = saddr->sll_protocol;
2086 addr = saddr->sll_addr;
2087 dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);
2088 need_rls_dev = true;
2092 if (unlikely(dev == NULL))
2096 if (unlikely(!(dev->flags & IFF_UP)))
2099 size_max = po->tx_ring.frame_size
2100 - (po->tp_hdrlen - sizeof(struct sockaddr_ll));
2103 ph = packet_current_frame(po, &po->tx_ring,
2104 TP_STATUS_SEND_REQUEST);
2106 if (unlikely(ph == NULL)) {
2111 status = TP_STATUS_SEND_REQUEST;
2112 hlen = LL_RESERVED_SPACE(dev);
2113 tlen = dev->needed_tailroom;
2114 skb = sock_alloc_send_skb(&po->sk,
2115 hlen + tlen + sizeof(struct sockaddr_ll),
2118 if (unlikely(skb == NULL))
2121 tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto,
2124 if (unlikely(tp_len < 0)) {
2126 __packet_set_status(po, ph,
2127 TP_STATUS_AVAILABLE);
2128 packet_increment_head(&po->tx_ring);
2132 status = TP_STATUS_WRONG_FORMAT;
2138 skb->destructor = tpacket_destruct_skb;
2139 __packet_set_status(po, ph, TP_STATUS_SENDING);
2140 atomic_inc(&po->tx_ring.pending);
2142 status = TP_STATUS_SEND_REQUEST;
2143 err = dev_queue_xmit(skb);
2144 if (unlikely(err > 0)) {
2145 err = net_xmit_errno(err);
2146 if (err && __packet_get_status(po, ph) ==
2147 TP_STATUS_AVAILABLE) {
2148 /* skb was destructed already */
2153 * skb was dropped but not destructed yet;
2154 * let's treat it like congestion or err < 0
2158 packet_increment_head(&po->tx_ring);
2160 } while (likely((ph != NULL) ||
2161 ((!(msg->msg_flags & MSG_DONTWAIT)) &&
2162 (atomic_read(&po->tx_ring.pending))))
2169 __packet_set_status(po, ph, status);
2175 mutex_unlock(&po->pg_vec_lock);
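/*
 * Example (user-space sketch of driving this TX path, illustrative): a
 * PACKET_TX_RING is mmap'ed, each ready frame is marked
 * TP_STATUS_SEND_REQUEST in its tpacket header, and a plain send()
 * kicks tpacket_snd() above.
 *
 *	struct tpacket_req req = {
 *		.tp_block_size = 4096, .tp_block_nr = 64,
 *		.tp_frame_size = 2048, .tp_frame_nr  = 128,
 *	};
 *	setsockopt(fd, SOL_PACKET, PACKET_TX_RING, &req, sizeof(req));
 *	void *ring = mmap(NULL, req.tp_block_size * req.tp_block_nr,
 *			  PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *	// fill a frame, set its tp_status = TP_STATUS_SEND_REQUEST, then:
 *	send(fd, NULL, 0, 0);
 */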
2179 static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
2180 size_t reserve, size_t len,
2181 size_t linear, int noblock,
2184 struct sk_buff *skb;
2186 /* Under a page? Don't bother with paged skb. */
2187 if (prepad + len < PAGE_SIZE || !linear)
2190 skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
2195 skb_reserve(skb, reserve);
2196 skb_put(skb, linear);
2197 skb->data_len = len - linear;
2198 skb->len += len - linear;
2203 static int packet_snd(struct socket *sock,
2204 struct msghdr *msg, size_t len)
2206 struct sock *sk = sock->sk;
2207 struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
2208 struct sk_buff *skb;
2209 struct net_device *dev;
2211 bool need_rls_dev = false;
2212 unsigned char *addr;
2213 int err, reserve = 0;
2214 struct virtio_net_hdr vnet_hdr = { 0 };
2217 struct packet_sock *po = pkt_sk(sk);
2218 unsigned short gso_type = 0;
2223 * Get and verify the address.
2226 if (saddr == NULL) {
2227 dev = po->prot_hook.dev;
2232 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2234 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
2236 proto = saddr->sll_protocol;
2237 addr = saddr->sll_addr;
2238 dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex);
2239 need_rls_dev = true;
2245 if (sock->type == SOCK_RAW)
2246 reserve = dev->hard_header_len;
2249 if (!(dev->flags & IFF_UP))
2252 if (po->has_vnet_hdr) {
2253 vnet_hdr_len = sizeof(vnet_hdr);
2256 if (len < vnet_hdr_len)
2259 len -= vnet_hdr_len;
2261 err = memcpy_fromiovec((void *)&vnet_hdr, msg->msg_iov,
2266 if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
2267 (vnet_hdr.csum_start + vnet_hdr.csum_offset + 2 >
2269 vnet_hdr.hdr_len = vnet_hdr.csum_start +
2270 vnet_hdr.csum_offset + 2;
2273 if (vnet_hdr.hdr_len > len)
2276 if (vnet_hdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) {
2277 switch (vnet_hdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
2278 case VIRTIO_NET_HDR_GSO_TCPV4:
2279 gso_type = SKB_GSO_TCPV4;
2281 case VIRTIO_NET_HDR_GSO_TCPV6:
2282 gso_type = SKB_GSO_TCPV6;
2284 case VIRTIO_NET_HDR_GSO_UDP:
2285 gso_type = SKB_GSO_UDP;
2291 if (vnet_hdr.gso_type & VIRTIO_NET_HDR_GSO_ECN)
2292 gso_type |= SKB_GSO_TCP_ECN;
2294 if (vnet_hdr.gso_size == 0)
2300 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
2301 if (!netif_supports_nofcs(dev)) {
2302 err = -EPROTONOSUPPORT;
2305 extra_len = 4; /* We're doing our own CRC */
2309 if (!gso_type && (len > dev->mtu + reserve + VLAN_HLEN + extra_len))
2313 hlen = LL_RESERVED_SPACE(dev);
2314 tlen = dev->needed_tailroom;
2315 skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, vnet_hdr.hdr_len,
2316 msg->msg_flags & MSG_DONTWAIT, &err);
2320 skb_set_network_header(skb, reserve);
2323 if (sock->type == SOCK_DGRAM &&
2324 (offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len)) < 0)
2327 /* Returns -EFAULT on error */
2328 err = skb_copy_datagram_from_iovec(skb, offset, msg->msg_iov, 0, len);
2332 sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
2334 if (dev->type == ARPHRD_ETHER) {
2335 skb->protocol = eth_type_trans(skb, dev);
2336 if (skb->protocol == htons(ETH_P_8021Q))
2337 reserve += VLAN_HLEN;
2339 skb->protocol = proto;
2343 if (!gso_type && (len > dev->mtu + reserve + extra_len)) {
2348 skb->priority = sk->sk_priority;
2349 skb->mark = sk->sk_mark;
2351 if (po->has_vnet_hdr) {
2352 if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
2353 if (!skb_partial_csum_set(skb, vnet_hdr.csum_start,
2354 vnet_hdr.csum_offset)) {
2360 skb_shinfo(skb)->gso_size = vnet_hdr.gso_size;
2361 skb_shinfo(skb)->gso_type = gso_type;
2363 /* Header must be checked, and gso_segs computed. */
2364 skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
2365 skb_shinfo(skb)->gso_segs = 0;
2367 len += vnet_hdr_len;
2370 skb_probe_transport_header(skb, reserve);
2372 if (unlikely(extra_len == 4))
2379 err = dev_queue_xmit(skb);
2380 if (err > 0 && (err = net_xmit_errno(err)) != 0)
2391 if (dev && need_rls_dev)
2397 static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
2398 struct msghdr *msg, size_t len)
2400 struct sock *sk = sock->sk;
2401 struct packet_sock *po = pkt_sk(sk);
2402 if (po->tx_ring.pg_vec)
2403 return tpacket_snd(po, msg);
2405 return packet_snd(sock, msg, len);
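/*
 * Example (user-space sketch): when PACKET_VNET_HDR is enabled on the
 * socket, packet_snd() above expects each sent buffer to begin with a
 * struct virtio_net_hdr describing checksum/GSO state, followed by the
 * complete frame. frame and frame_len are hypothetical.
 *
 *	#include <linux/virtio_net.h>
 *	#include <sys/uio.h>
 *
 *	int on = 1;
 *	setsockopt(fd, SOL_PACKET, PACKET_VNET_HDR, &on, sizeof(on));
 *	struct virtio_net_hdr vh = { .gso_type = VIRTIO_NET_HDR_GSO_NONE };
 *	struct iovec iov[2] = {
 *		{ &vh, sizeof(vh) },	// header first
 *		{ frame, frame_len },	// then the frame itself
 *	};
 *	writev(fd, iov, 2);
 */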
2409 * Close a PACKET socket. This is fairly simple. We immediately go
2410 * to 'closed' state and remove our protocol entry in the device list.
2413 static int packet_release(struct socket *sock)
2415 struct sock *sk = sock->sk;
2416 struct packet_sock *po;
2418 union tpacket_req_u req_u;
2426 mutex_lock(&net->packet.sklist_lock);
2427 sk_del_node_init_rcu(sk);
2428 mutex_unlock(&net->packet.sklist_lock);
2431 sock_prot_inuse_add(net, sk->sk_prot, -1);
2434 spin_lock(&po->bind_lock);
2435 unregister_prot_hook(sk, false);
2436 if (po->prot_hook.dev) {
2437 dev_put(po->prot_hook.dev);
2438 po->prot_hook.dev = NULL;
2440 spin_unlock(&po->bind_lock);
2442 packet_flush_mclist(sk);
2444 if (po->rx_ring.pg_vec) {
2445 memset(&req_u, 0, sizeof(req_u));
2446 packet_set_ring(sk, &req_u, 1, 0);
2449 if (po->tx_ring.pg_vec) {
2450 memset(&req_u, 0, sizeof(req_u));
2451 packet_set_ring(sk, &req_u, 1, 1);
2458 * Now the socket is dead. No more input will appear.
2465 skb_queue_purge(&sk->sk_receive_queue);
2466 sk_refcnt_debug_release(sk);
2473 * Attach a packet hook.
2476 static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
2478 struct packet_sock *po = pkt_sk(sk);
2489 spin_lock(&po->bind_lock);
2490 unregister_prot_hook(sk, true);
2492 po->prot_hook.type = protocol;
2493 if (po->prot_hook.dev)
2494 dev_put(po->prot_hook.dev);
2495 po->prot_hook.dev = dev;
2497 po->ifindex = dev ? dev->ifindex : 0;
2502 if (!dev || (dev->flags & IFF_UP)) {
2503 register_prot_hook(sk);
2505 sk->sk_err = ENETDOWN;
2506 if (!sock_flag(sk, SOCK_DEAD))
2507 sk->sk_error_report(sk);
2511 spin_unlock(&po->bind_lock);
2517 * Bind a packet socket to a device
2520 static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
2523 struct sock *sk = sock->sk;
2525 struct net_device *dev;
2532 if (addr_len != sizeof(struct sockaddr))
2534 strlcpy(name, uaddr->sa_data, sizeof(name));
2536 dev = dev_get_by_name(sock_net(sk), name);
2538 err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
2542 static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
2544 struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
2545 struct sock *sk = sock->sk;
2546 struct net_device *dev = NULL;
2554 if (addr_len < sizeof(struct sockaddr_ll))
2556 if (sll->sll_family != AF_PACKET)
2559 if (sll->sll_ifindex) {
2561 dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex);
2565 err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
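/*
 * Example (user-space sketch of the bind path above, illustrative):
 * sockaddr_ll selects the device by ifindex and optionally narrows the
 * protocol; sll_protocol == 0 keeps the protocol given to socket().
 *
 *	#include <net/if.h>
 *
 *	struct sockaddr_ll sll = {
 *		.sll_family   = AF_PACKET,
 *		.sll_protocol = htons(ETH_P_ALL),
 *		.sll_ifindex  = if_nametoindex("eth0"),	// illustrative name
 *	};
 *	bind(fd, (struct sockaddr *)&sll, sizeof(sll));
 */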
2571 static struct proto packet_proto = {
2573 .owner = THIS_MODULE,
2574 .obj_size = sizeof(struct packet_sock),
/*
 *	Create a packet of type SOCK_PACKET.
 */

static int packet_create(struct net *net, struct socket *sock, int protocol,
			 int kern)
{
	struct sock *sk;
	struct packet_sock *po;
	__be16 proto = (__force __be16)protocol; /* weird, but documented */
	int err;

	if (!ns_capable(net->user_ns, CAP_NET_RAW))
		return -EPERM;
	if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
	    sock->type != SOCK_PACKET)
		return -ESOCKTNOSUPPORT;

	sock->state = SS_UNCONNECTED;

	err = -ENOBUFS;
	sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
	if (sk == NULL)
		goto out;

	sock->ops = &packet_ops;
	if (sock->type == SOCK_PACKET)
		sock->ops = &packet_ops_spkt;

	sock_init_data(sock, sk);

	po = pkt_sk(sk);
	sk->sk_family = PF_PACKET;
	po->num = proto;

	sk->sk_destruct = packet_sock_destruct;
	sk_refcnt_debug_inc(sk);

	/*
	 *	Attach a protocol block
	 */

	spin_lock_init(&po->bind_lock);
	mutex_init(&po->pg_vec_lock);
	po->prot_hook.func = packet_rcv;

	if (sock->type == SOCK_PACKET)
		po->prot_hook.func = packet_rcv_spkt;

	po->prot_hook.af_packet_priv = sk;

	if (proto) {
		po->prot_hook.type = proto;
		register_prot_hook(sk);
	}

	mutex_lock(&net->packet.sklist_lock);
	sk_add_node_rcu(sk, &net->packet.sklist);
	mutex_unlock(&net->packet.sklist_lock);

	preempt_disable();
	sock_prot_inuse_add(net, &packet_proto, 1);
	preempt_enable();

	return 0;
out:
	return err;
}
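/*
 * Example (userspace sketch, not part of this file): the three socket
 * types accepted above. All of them require CAP_NET_RAW.
 *
 *	int raw  = socket(AF_PACKET, SOCK_RAW,    htons(ETH_P_ALL));
 *	int dg   = socket(AF_PACKET, SOCK_DGRAM,  htons(ETH_P_IP));
 *	int spkt = socket(AF_PACKET, SOCK_PACKET, htons(ETH_P_ALL));
 */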
/*
 *	Pull a packet from our receive queue and hand it to the user.
 *	If necessary we block.
 */

static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
			  struct msghdr *msg, size_t len, int flags)
{
	struct sock *sk = sock->sk;
	struct sk_buff *skb;
	int copied, err;
	struct sockaddr_ll *sll;
	int vnet_hdr_len = 0;

	err = -EINVAL;
	if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
		goto out;

#if 0
	/* What error should we return now? EUNATTACH? */
	if (pkt_sk(sk)->ifindex < 0)
		return -ENODEV;
#endif

	if (flags & MSG_ERRQUEUE) {
		err = sock_recv_errqueue(sk, msg, len,
					 SOL_PACKET, PACKET_TX_TIMESTAMP);
		goto out;
	}

	/*
	 *	Call the generic datagram receiver. This handles all sorts
	 *	of horrible races and re-entrancy so we can forget about it
	 *	in the protocol layers.
	 *
	 *	Now it will return ENETDOWN, if the device has just gone down,
	 *	but then it will block.
	 */

	skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);

	/*
	 *	An error occurred so return it. Because skb_recv_datagram()
	 *	handles the blocking we don't need to see or worry about
	 *	blocking retries.
	 */

	if (skb == NULL)
		goto out;

	if (pkt_sk(sk)->has_vnet_hdr) {
		struct virtio_net_hdr vnet_hdr = { 0 };

		err = -EINVAL;
		vnet_hdr_len = sizeof(vnet_hdr);
		if (len < vnet_hdr_len)
			goto out_free;

		len -= vnet_hdr_len;

		if (skb_is_gso(skb)) {
			struct skb_shared_info *sinfo = skb_shinfo(skb);

			/* This is a hint as to how much should be linear. */
			vnet_hdr.hdr_len = skb_headlen(skb);
			vnet_hdr.gso_size = sinfo->gso_size;
			if (sinfo->gso_type & SKB_GSO_TCPV4)
				vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
			else if (sinfo->gso_type & SKB_GSO_TCPV6)
				vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
			else if (sinfo->gso_type & SKB_GSO_UDP)
				vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_UDP;
			else if (sinfo->gso_type & SKB_GSO_FCOE)
				goto out_free;
			else
				BUG();
			if (sinfo->gso_type & SKB_GSO_TCP_ECN)
				vnet_hdr.gso_type |= VIRTIO_NET_HDR_GSO_ECN;
		} else
			vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE;

		if (skb->ip_summed == CHECKSUM_PARTIAL) {
			vnet_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
			vnet_hdr.csum_start = skb_checksum_start_offset(skb);
			vnet_hdr.csum_offset = skb->csum_offset;
		} else if (skb->ip_summed == CHECKSUM_UNNECESSARY) {
			vnet_hdr.flags = VIRTIO_NET_HDR_F_DATA_VALID;
		} /* else everything is zero */

		err = memcpy_toiovec(msg->msg_iov, (void *)&vnet_hdr,
				     vnet_hdr_len);
		if (err < 0)
			goto out_free;
	}

	/*
	 *	If the address length field is there to be filled in, we fill
	 *	it in now.
	 */

	sll = &PACKET_SKB_CB(skb)->sa.ll;
	if (sock->type == SOCK_PACKET)
		msg->msg_namelen = sizeof(struct sockaddr_pkt);
	else
		msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr);

	/*
	 *	You lose any data beyond the buffer you gave. If it worries a
	 *	user program they can ask the device for its MTU anyway.
	 */

	copied = skb->len;
	if (copied > len) {
		copied = len;
		msg->msg_flags |= MSG_TRUNC;
	}

	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
	if (err)
		goto out_free;

	sock_recv_ts_and_drops(msg, sk, skb);

	if (msg->msg_name)
		memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
		       msg->msg_namelen);

	if (pkt_sk(sk)->auxdata) {
		struct tpacket_auxdata aux;

		aux.tp_status = TP_STATUS_USER;
		if (skb->ip_summed == CHECKSUM_PARTIAL)
			aux.tp_status |= TP_STATUS_CSUMNOTREADY;
		aux.tp_len = PACKET_SKB_CB(skb)->origlen;
		aux.tp_snaplen = skb->len;
		aux.tp_mac = 0;
		aux.tp_net = skb_network_offset(skb);
		if (vlan_tx_tag_present(skb)) {
			aux.tp_vlan_tci = vlan_tx_tag_get(skb);
			aux.tp_status |= TP_STATUS_VLAN_VALID;
		} else {
			aux.tp_vlan_tci = 0;
		}
		aux.tp_padding = 0;
		put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
	}

	/*
	 *	Free or return the buffer as appropriate. Again this
	 *	hides all the races and re-entrancy issues from us.
	 */
	err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);

out_free:
	skb_free_datagram(sk, skb);
out:
	return err;
}
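/*
 * Example (userspace sketch, not part of this file): receiving one frame
 * together with the tpacket_auxdata control message emitted above;
 * assumes PACKET_AUXDATA was enabled with setsockopt() beforehand.
 *
 *	char frame[2048];
 *	char cbuf[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
 *	struct iovec iov = { .iov_base = frame, .iov_len = sizeof(frame) };
 *	struct msghdr msg = {
 *		.msg_iov = &iov, .msg_iovlen = 1,
 *		.msg_control = cbuf, .msg_controllen = sizeof(cbuf),
 *	};
 *	struct cmsghdr *cmsg;
 *	ssize_t n = recvmsg(fd, &msg, 0);
 *
 *	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg))
 *		if (cmsg->cmsg_level == SOL_PACKET &&
 *		    cmsg->cmsg_type == PACKET_AUXDATA) {
 *			struct tpacket_auxdata *aux = (void *)CMSG_DATA(cmsg);
 *			// aux->tp_len is the original wire length; n and
 *			// MSG_TRUNC tell whether the copy was cut short
 *		}
 */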
static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
			       int *uaddr_len, int peer)
{
	struct net_device *dev;
	struct sock *sk = sock->sk;

	if (peer)
		return -EOPNOTSUPP;

	uaddr->sa_family = AF_PACKET;
	memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data));
	rcu_read_lock();
	dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
	if (dev)
		strlcpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data));
	rcu_read_unlock();
	*uaddr_len = sizeof(*uaddr);

	return 0;
}
static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
			  int *uaddr_len, int peer)
{
	struct net_device *dev;
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);

	if (peer)
		return -EOPNOTSUPP;

	sll->sll_family = AF_PACKET;
	sll->sll_ifindex = po->ifindex;
	sll->sll_protocol = po->num;
	sll->sll_pkttype = 0;
	rcu_read_lock();
	dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
	if (dev) {
		sll->sll_hatype = dev->type;
		sll->sll_halen = dev->addr_len;
		memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
	} else {
		sll->sll_hatype = 0;	/* Bad: we have no ARPHRD_UNSPEC */
		sll->sll_halen = 0;
	}
	rcu_read_unlock();
	*uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;

	return 0;
}
static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
			 int what)
{
	switch (i->type) {
	case PACKET_MR_MULTICAST:
		if (i->alen != dev->addr_len)
			return -EINVAL;
		if (what > 0)
			return dev_mc_add(dev, i->addr);
		else
			return dev_mc_del(dev, i->addr);
		break;
	case PACKET_MR_PROMISC:
		return dev_set_promiscuity(dev, what);
		break;
	case PACKET_MR_ALLMULTI:
		return dev_set_allmulti(dev, what);
		break;
	case PACKET_MR_UNICAST:
		if (i->alen != dev->addr_len)
			return -EINVAL;
		if (what > 0)
			return dev_uc_add(dev, i->addr);
		else
			return dev_uc_del(dev, i->addr);
		break;
	default:
		break;
	}
	return 0;
}
static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
{
	for ( ; i; i = i->next) {
		if (i->ifindex == dev->ifindex)
			packet_dev_mc(dev, i, what);
	}
}
static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
{
	struct packet_sock *po = pkt_sk(sk);
	struct packet_mclist *ml, *i;
	struct net_device *dev;
	int err;

	rtnl_lock();

	err = -ENODEV;
	dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
	if (!dev)
		goto done;

	err = -EINVAL;
	if (mreq->mr_alen > dev->addr_len)
		goto done;

	err = -ENOBUFS;
	i = kmalloc(sizeof(*i), GFP_KERNEL);
	if (i == NULL)
		goto done;

	err = 0;
	for (ml = po->mclist; ml; ml = ml->next) {
		if (ml->ifindex == mreq->mr_ifindex &&
		    ml->type == mreq->mr_type &&
		    ml->alen == mreq->mr_alen &&
		    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
			ml->count++;
			/* Free the new element ... */
			kfree(i);
			goto done;
		}
	}

	i->type = mreq->mr_type;
	i->ifindex = mreq->mr_ifindex;
	i->alen = mreq->mr_alen;
	memcpy(i->addr, mreq->mr_address, i->alen);
	i->count = 1;
	i->next = po->mclist;
	po->mclist = i;
	err = packet_dev_mc(dev, i, 1);
	if (err) {
		po->mclist = i->next;
		kfree(i);
	}

done:
	rtnl_unlock();
	return err;
}
static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
{
	struct packet_mclist *ml, **mlp;

	rtnl_lock();

	for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
		if (ml->ifindex == mreq->mr_ifindex &&
		    ml->type == mreq->mr_type &&
		    ml->alen == mreq->mr_alen &&
		    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
			if (--ml->count == 0) {
				struct net_device *dev;
				*mlp = ml->next;
				dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
				if (dev)
					packet_dev_mc(dev, ml, -1);
				kfree(ml);
			}
			rtnl_unlock();
			return 0;
		}
	}
	rtnl_unlock();
	return -EADDRNOTAVAIL;
}
static void packet_flush_mclist(struct sock *sk)
{
	struct packet_sock *po = pkt_sk(sk);
	struct packet_mclist *ml;

	if (!po->mclist)
		return;

	rtnl_lock();
	while ((ml = po->mclist) != NULL) {
		struct net_device *dev;

		po->mclist = ml->next;
		dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
		if (dev != NULL)
			packet_dev_mc(dev, ml, -1);
		kfree(ml);
	}
	rtnl_unlock();
}
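/*
 * Example (userspace sketch, not part of this file): enabling promiscuous
 * mode through the mclist machinery above. Unlike a raw SIOCSIFFLAGS
 * toggle, the reference taken here is dropped automatically when the
 * socket is closed (see packet_flush_mclist() in packet_release()).
 * The interface name "eth0" is only illustrative.
 *
 *	struct packet_mreq mreq;
 *
 *	memset(&mreq, 0, sizeof(mreq));
 *	mreq.mr_ifindex = if_nametoindex("eth0");
 *	mreq.mr_type    = PACKET_MR_PROMISC;
 *	setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP,
 *		   &mreq, sizeof(mreq));
 */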
static int
packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	int ret;

	if (level != SOL_PACKET)
		return -ENOPROTOOPT;

	switch (optname) {
	case PACKET_ADD_MEMBERSHIP:
	case PACKET_DROP_MEMBERSHIP:
	{
		struct packet_mreq_max mreq;
		int len = optlen;

		memset(&mreq, 0, sizeof(mreq));
		if (len < sizeof(struct packet_mreq))
			return -EINVAL;
		if (len > sizeof(mreq))
			len = sizeof(mreq);
		if (copy_from_user(&mreq, optval, len))
			return -EFAULT;
		if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
			return -EINVAL;
		if (optname == PACKET_ADD_MEMBERSHIP)
			ret = packet_mc_add(sk, &mreq);
		else
			ret = packet_mc_drop(sk, &mreq);
		return ret;
	}

	case PACKET_RX_RING:
	case PACKET_TX_RING:
	{
		union tpacket_req_u req_u;
		int len;

		switch (po->tp_version) {
		case TPACKET_V1:
		case TPACKET_V2:
			len = sizeof(req_u.req);
			break;
		case TPACKET_V3:
		default:
			len = sizeof(req_u.req3);
			break;
		}
		if (optlen < len)
			return -EINVAL;
		if (pkt_sk(sk)->has_vnet_hdr)
			return -EINVAL;
		if (copy_from_user(&req_u.req, optval, len))
			return -EFAULT;
		return packet_set_ring(sk, &req_u, 0,
			optname == PACKET_TX_RING);
	}
	case PACKET_COPY_THRESH:
	{
		int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;

		pkt_sk(sk)->copy_thresh = val;
		return 0;
	}
	case PACKET_VERSION:
	{
		int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
			return -EBUSY;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;
		switch (val) {
		case TPACKET_V1:
		case TPACKET_V2:
		case TPACKET_V3:
			po->tp_version = val;
			return 0;
		default:
			return -EINVAL;
		}
	}
	case PACKET_RESERVE:
	{
		unsigned int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
			return -EBUSY;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;
		po->tp_reserve = val;
		return 0;
	}
	case PACKET_LOSS:
	{
		unsigned int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
			return -EBUSY;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;
		po->tp_loss = !!val;
		return 0;
	}
	case PACKET_AUXDATA:
	{
		int val;

		if (optlen < sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;

		po->auxdata = !!val;
		return 0;
	}
	case PACKET_ORIGDEV:
	{
		int val;

		if (optlen < sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;

		po->origdev = !!val;
		return 0;
	}
	case PACKET_VNET_HDR:
	{
		int val;

		if (sock->type != SOCK_RAW)
			return -EINVAL;
		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
			return -EBUSY;
		if (optlen < sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;

		po->has_vnet_hdr = !!val;
		return 0;
	}
	case PACKET_TIMESTAMP:
	{
		int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;

		po->tp_tstamp = val;
		return 0;
	}
	case PACKET_FANOUT:
	{
		int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;

		return fanout_add(sk, val & 0xffff, val >> 16);
	}
	case PACKET_TX_HAS_OFF:
	{
		unsigned int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
			return -EBUSY;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;
		po->tp_tx_has_off = !!val;
		return 0;
	}
	default:
		return -ENOPROTOOPT;
	}
}
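/*
 * Example (userspace sketch, not part of this file): configuring a
 * TPACKET_V2 receive ring via the PACKET_VERSION and PACKET_RX_RING
 * cases above. The geometry is illustrative; the block size must be a
 * multiple of PAGE_SIZE and the frame counts must agree, as enforced
 * in packet_set_ring().
 *
 *	struct tpacket_req req = {
 *		.tp_block_size = 1 << 16,
 *		.tp_block_nr   = 64,
 *		.tp_frame_size = 1 << 11,
 *		.tp_frame_nr   = ((1 << 16) / (1 << 11)) * 64,
 *	};
 *	int ver = TPACKET_V2;
 *
 *	setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver));
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 */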
static int packet_getsockopt(struct socket *sock, int level, int optname,
			     char __user *optval, int __user *optlen)
{
	int len;
	int val, lv = sizeof(val);
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	void *data = &val;
	union tpacket_stats_u st;

	if (level != SOL_PACKET)
		return -ENOPROTOOPT;

	if (get_user(len, optlen))
		return -EFAULT;

	if (len < 0)
		return -EINVAL;

	switch (optname) {
	case PACKET_STATISTICS:
		spin_lock_bh(&sk->sk_receive_queue.lock);
		memcpy(&st, &po->stats, sizeof(st));
		memset(&po->stats, 0, sizeof(po->stats));
		spin_unlock_bh(&sk->sk_receive_queue.lock);

		if (po->tp_version == TPACKET_V3) {
			lv = sizeof(struct tpacket_stats_v3);
			st.stats3.tp_packets += st.stats3.tp_drops;
			data = &st.stats3;
		} else {
			lv = sizeof(struct tpacket_stats);
			st.stats1.tp_packets += st.stats1.tp_drops;
			data = &st.stats1;
		}

		break;
	case PACKET_AUXDATA:
		val = po->auxdata;
		break;
	case PACKET_ORIGDEV:
		val = po->origdev;
		break;
	case PACKET_VNET_HDR:
		val = po->has_vnet_hdr;
		break;
	case PACKET_VERSION:
		val = po->tp_version;
		break;
	case PACKET_HDRLEN:
		if (len > sizeof(int))
			len = sizeof(int);
		if (copy_from_user(&val, optval, len))
			return -EFAULT;
		switch (val) {
		case TPACKET_V1:
			val = sizeof(struct tpacket_hdr);
			break;
		case TPACKET_V2:
			val = sizeof(struct tpacket2_hdr);
			break;
		case TPACKET_V3:
			val = sizeof(struct tpacket3_hdr);
			break;
		default:
			return -EINVAL;
		}
		break;
	case PACKET_RESERVE:
		val = po->tp_reserve;
		break;
	case PACKET_LOSS:
		val = po->tp_loss;
		break;
	case PACKET_TIMESTAMP:
		val = po->tp_tstamp;
		break;
	case PACKET_FANOUT:
		val = (po->fanout ?
		       ((u32)po->fanout->id |
			((u32)po->fanout->type << 16) |
			((u32)po->fanout->flags << 24)) :
		       0);
		break;
	case PACKET_TX_HAS_OFF:
		val = po->tp_tx_has_off;
		break;
	default:
		return -ENOPROTOOPT;
	}

	if (len > lv)
		len = lv;
	if (put_user(len, optlen))
		return -EFAULT;
	if (copy_to_user(optval, data, len))
		return -EFAULT;
	return 0;
}
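/*
 * Example (userspace sketch, not part of this file): reading the
 * packet/drop counters served by PACKET_STATISTICS above. The handler
 * zeroes the counters on every read, so the values are deltas since
 * the previous call (layout shown for TPACKET_V1/V2 sockets).
 *
 *	struct tpacket_stats stats;
 *	socklen_t slen = sizeof(stats);
 *
 *	if (getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, &stats, &slen) == 0)
 *		printf("seen %u, dropped %u\n",
 *		       stats.tp_packets, stats.tp_drops);
 */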
static int packet_notifier(struct notifier_block *this,
			   unsigned long msg, void *ptr)
{
	struct sock *sk;
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net *net = dev_net(dev);

	rcu_read_lock();
	sk_for_each_rcu(sk, &net->packet.sklist) {
		struct packet_sock *po = pkt_sk(sk);

		switch (msg) {
		case NETDEV_UNREGISTER:
			if (po->mclist)
				packet_dev_mclist(dev, po->mclist, -1);
			/* fallthrough */

		case NETDEV_DOWN:
			if (dev->ifindex == po->ifindex) {
				spin_lock(&po->bind_lock);
				if (po->running) {
					__unregister_prot_hook(sk, false);
					sk->sk_err = ENETDOWN;
					if (!sock_flag(sk, SOCK_DEAD))
						sk->sk_error_report(sk);
				}
				if (msg == NETDEV_UNREGISTER) {
					po->ifindex = -1;
					if (po->prot_hook.dev)
						dev_put(po->prot_hook.dev);
					po->prot_hook.dev = NULL;
				}
				spin_unlock(&po->bind_lock);
			}
			break;
		case NETDEV_UP:
			if (dev->ifindex == po->ifindex) {
				spin_lock(&po->bind_lock);
				if (po->num)
					register_prot_hook(sk);
				spin_unlock(&po->bind_lock);
			}
			break;
		}
	}
	rcu_read_unlock();
	return NOTIFY_DONE;
}
static int packet_ioctl(struct socket *sock, unsigned int cmd,
			unsigned long arg)
{
	struct sock *sk = sock->sk;

	switch (cmd) {
	case SIOCOUTQ:
	{
		int amount = sk_wmem_alloc_get(sk);

		return put_user(amount, (int __user *)arg);
	}
	case SIOCINQ:
	{
		struct sk_buff *skb;
		int amount = 0;

		spin_lock_bh(&sk->sk_receive_queue.lock);
		skb = skb_peek(&sk->sk_receive_queue);
		if (skb)
			amount = skb->len;
		spin_unlock_bh(&sk->sk_receive_queue.lock);
		return put_user(amount, (int __user *)arg);
	}
	case SIOCGSTAMP:
		return sock_get_timestamp(sk, (struct timeval __user *)arg);
	case SIOCGSTAMPNS:
		return sock_get_timestampns(sk, (struct timespec __user *)arg);

#ifdef CONFIG_INET
	case SIOCADDRT:
	case SIOCDELRT:
	case SIOCDARP:
	case SIOCGARP:
	case SIOCSARP:
	case SIOCGIFADDR:
	case SIOCSIFADDR:
	case SIOCGIFBRDADDR:
	case SIOCSIFBRDADDR:
	case SIOCGIFNETMASK:
	case SIOCSIFNETMASK:
	case SIOCGIFDSTADDR:
	case SIOCSIFDSTADDR:
	case SIOCSIFFLAGS:
		return inet_dgram_ops.ioctl(sock, cmd, arg);
#endif

	default:
		return -ENOIOCTLCMD;
	}
	return 0;
}
static unsigned int packet_poll(struct file *file, struct socket *sock,
				poll_table *wait)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	unsigned int mask = datagram_poll(file, sock, wait);

	spin_lock_bh(&sk->sk_receive_queue.lock);
	if (po->rx_ring.pg_vec) {
		if (!packet_previous_rx_frame(po, &po->rx_ring,
			TP_STATUS_KERNEL))
			mask |= POLLIN | POLLRDNORM;
	}
	spin_unlock_bh(&sk->sk_receive_queue.lock);
	spin_lock_bh(&sk->sk_write_queue.lock);
	if (po->tx_ring.pg_vec) {
		if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
			mask |= POLLOUT | POLLWRNORM;
	}
	spin_unlock_bh(&sk->sk_write_queue.lock);
	return mask;
}
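/*
 * Example (userspace sketch, not part of this file): waiting for ring
 * frames with poll(). The rx_ring branch above raises POLLIN once the
 * frame at the reader's current position is no longer owned by the
 * kernel (TP_STATUS_KERNEL).
 *
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLERR };
 *
 *	if (poll(&pfd, 1, -1) > 0 && (pfd.revents & POLLIN)) {
 *		// at least one frame in the mmap()ed ring is readable
 *	}
 */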
/* Dirty? Well, I still did not learn a better way to account
 * for user mmaps.
 */

static void packet_mm_open(struct vm_area_struct *vma)
{
	struct file *file = vma->vm_file;
	struct socket *sock = file->private_data;
	struct sock *sk = sock->sk;

	if (sk)
		atomic_inc(&pkt_sk(sk)->mapped);
}

static void packet_mm_close(struct vm_area_struct *vma)
{
	struct file *file = vma->vm_file;
	struct socket *sock = file->private_data;
	struct sock *sk = sock->sk;

	if (sk)
		atomic_dec(&pkt_sk(sk)->mapped);
}
static const struct vm_operations_struct packet_mmap_ops = {
	.open	=	packet_mm_open,
	.close	=	packet_mm_close,
};
static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
			unsigned int len)
{
	int i;

	for (i = 0; i < len; i++) {
		if (likely(pg_vec[i].buffer)) {
			if (is_vmalloc_addr(pg_vec[i].buffer))
				vfree(pg_vec[i].buffer);
			else
				free_pages((unsigned long)pg_vec[i].buffer,
					   order);
			pg_vec[i].buffer = NULL;
		}
	}
	kfree(pg_vec);
}
static char *alloc_one_pg_vec_page(unsigned long order)
{
	char *buffer = NULL;
	gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
			  __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;

	buffer = (char *) __get_free_pages(gfp_flags, order);
	if (buffer)
		return buffer;

	/*
	 * __get_free_pages failed, fall back to vmalloc
	 */
	buffer = vzalloc((1 << order) * PAGE_SIZE);
	if (buffer)
		return buffer;

	/*
	 * vmalloc failed, let's dig into swap here
	 */
	gfp_flags &= ~__GFP_NORETRY;
	buffer = (char *)__get_free_pages(gfp_flags, order);
	if (buffer)
		return buffer;

	/*
	 * complete and utter failure
	 */
	return NULL;
}
static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
{
	unsigned int block_nr = req->tp_block_nr;
	struct pgv *pg_vec;
	int i;

	pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL);
	if (unlikely(!pg_vec))
		goto out;

	for (i = 0; i < block_nr; i++) {
		pg_vec[i].buffer = alloc_one_pg_vec_page(order);
		if (unlikely(!pg_vec[i].buffer))
			goto out_free_pgvec;
	}

out:
	return pg_vec;

out_free_pgvec:
	free_pg_vec(pg_vec, order, block_nr);
	pg_vec = NULL;
	goto out;
}
static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
		int closing, int tx_ring)
{
	struct pgv *pg_vec = NULL;
	struct packet_sock *po = pkt_sk(sk);
	int was_running, order = 0;
	struct packet_ring_buffer *rb;
	struct sk_buff_head *rb_queue;
	__be16 num;
	int err = -EINVAL;
	/* Added to avoid minimal code churn */
	struct tpacket_req *req = &req_u->req;

	/* Opening a Tx-ring is NOT supported in TPACKET_V3 */
	if (!closing && tx_ring && (po->tp_version > TPACKET_V2)) {
		WARN(1, "Tx-ring is not supported.\n");
		goto out;
	}

	rb = tx_ring ? &po->tx_ring : &po->rx_ring;
	rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;

	err = -EBUSY;
	if (!closing) {
		if (atomic_read(&po->mapped))
			goto out;
		if (atomic_read(&rb->pending))
			goto out;
	}

	if (req->tp_block_nr) {
		/* Sanity tests and some calculations */
		err = -EBUSY;
		if (unlikely(rb->pg_vec))
			goto out;

		switch (po->tp_version) {
		case TPACKET_V1:
			po->tp_hdrlen = TPACKET_HDRLEN;
			break;
		case TPACKET_V2:
			po->tp_hdrlen = TPACKET2_HDRLEN;
			break;
		case TPACKET_V3:
			po->tp_hdrlen = TPACKET3_HDRLEN;
			break;
		}

		err = -EINVAL;
		if (unlikely((int)req->tp_block_size <= 0))
			goto out;
		if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
			goto out;
		if (unlikely(req->tp_frame_size < po->tp_hdrlen +
					po->tp_reserve))
			goto out;
		if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
			goto out;

		rb->frames_per_block = req->tp_block_size/req->tp_frame_size;
		if (unlikely(rb->frames_per_block <= 0))
			goto out;
		if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
					req->tp_frame_nr))
			goto out;

		err = -ENOMEM;
		order = get_order(req->tp_block_size);
		pg_vec = alloc_pg_vec(req, order);
		if (unlikely(!pg_vec))
			goto out;
		switch (po->tp_version) {
		case TPACKET_V3:
			/* Transmit path is not supported. We checked
			 * it above but just being paranoid
			 */
			if (!tx_ring)
				init_prb_bdqc(po, rb, pg_vec, req_u, tx_ring);
			break;
		default:
			break;
		}
	}
	/* Done */
	else {
		err = -EINVAL;
		if (unlikely(req->tp_frame_nr))
			goto out;
	}

	lock_sock(sk);

	/* Detach socket from network */
	spin_lock(&po->bind_lock);
	was_running = po->running;
	num = po->num;
	if (was_running) {
		po->num = 0;
		__unregister_prot_hook(sk, false);
	}
	spin_unlock(&po->bind_lock);

	synchronize_net();

	err = -EBUSY;
	mutex_lock(&po->pg_vec_lock);
	if (closing || atomic_read(&po->mapped) == 0) {
		err = 0;
		spin_lock_bh(&rb_queue->lock);
		swap(rb->pg_vec, pg_vec);
		rb->frame_max = (req->tp_frame_nr - 1);
		rb->head = 0;
		rb->frame_size = req->tp_frame_size;
		spin_unlock_bh(&rb_queue->lock);

		swap(rb->pg_vec_order, order);
		swap(rb->pg_vec_len, req->tp_block_nr);

		rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
		po->prot_hook.func = (po->rx_ring.pg_vec) ?
						tpacket_rcv : packet_rcv;
		skb_queue_purge(rb_queue);
		if (atomic_read(&po->mapped))
			pr_err("packet_mmap: vma is busy: %d\n",
			       atomic_read(&po->mapped));
	}
	mutex_unlock(&po->pg_vec_lock);

	spin_lock(&po->bind_lock);
	if (was_running) {
		po->num = num;
		register_prot_hook(sk);
	}
	spin_unlock(&po->bind_lock);
	if (closing && (po->tp_version > TPACKET_V2)) {
		/* Because we don't support block-based V3 on tx-ring */
		if (!tx_ring)
			prb_shutdown_retire_blk_timer(po, tx_ring, rb_queue);
	}
	release_sock(sk);

	if (pg_vec)
		free_pg_vec(pg_vec, order, req->tp_block_nr);
out:
	return err;
}
static int packet_mmap(struct file *file, struct socket *sock,
		struct vm_area_struct *vma)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	unsigned long size, expected_size;
	struct packet_ring_buffer *rb;
	unsigned long start;
	int err = -EINVAL;
	int i;

	if (vma->vm_pgoff)
		return -EINVAL;

	mutex_lock(&po->pg_vec_lock);

	expected_size = 0;
	for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
		if (rb->pg_vec) {
			expected_size += rb->pg_vec_len
						* rb->pg_vec_pages
						* PAGE_SIZE;
		}
	}

	if (expected_size == 0)
		goto out;

	size = vma->vm_end - vma->vm_start;
	if (size != expected_size)
		goto out;

	start = vma->vm_start;
	for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
		if (rb->pg_vec == NULL)
			continue;

		for (i = 0; i < rb->pg_vec_len; i++) {
			struct page *page;
			void *kaddr = rb->pg_vec[i].buffer;
			int pg_num;

			for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
				page = pgv_to_page(kaddr);
				err = vm_insert_page(vma, start, page);
				if (unlikely(err))
					goto out;
				start += PAGE_SIZE;
				kaddr += PAGE_SIZE;
			}
		}
	}

	atomic_inc(&po->mapped);
	vma->vm_ops = &packet_mmap_ops;
	err = 0;

out:
	mutex_unlock(&po->pg_vec_lock);
	return err;
}
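/*
 * Example (userspace sketch, not part of this file): mapping the rings in
 * the single contiguous area this handler expects (rx first, then tx,
 * vm_pgoff == 0, and exactly the combined size). req_rx/req_tx stand for
 * the tpacket_req structures previously passed to setsockopt().
 *
 *	size_t rx_sz = (size_t)req_rx.tp_block_size * req_rx.tp_block_nr;
 *	size_t tx_sz = (size_t)req_tx.tp_block_size * req_tx.tp_block_nr;
 *	void *ring = mmap(NULL, rx_sz + tx_sz, PROT_READ | PROT_WRITE,
 *			  MAP_SHARED, fd, 0);
 *	if (ring == MAP_FAILED)
 *		perror("mmap");
 */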
static const struct proto_ops packet_ops_spkt = {
	.family =	PF_PACKET,
	.owner =	THIS_MODULE,
	.release =	packet_release,
	.bind =		packet_bind_spkt,
	.connect =	sock_no_connect,
	.socketpair =	sock_no_socketpair,
	.accept =	sock_no_accept,
	.getname =	packet_getname_spkt,
	.poll =		datagram_poll,
	.ioctl =	packet_ioctl,
	.listen =	sock_no_listen,
	.shutdown =	sock_no_shutdown,
	.setsockopt =	sock_no_setsockopt,
	.getsockopt =	sock_no_getsockopt,
	.sendmsg =	packet_sendmsg_spkt,
	.recvmsg =	packet_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
};
static const struct proto_ops packet_ops = {
	.family =	PF_PACKET,
	.owner =	THIS_MODULE,
	.release =	packet_release,
	.bind =		packet_bind,
	.connect =	sock_no_connect,
	.socketpair =	sock_no_socketpair,
	.accept =	sock_no_accept,
	.getname =	packet_getname,
	.poll =		packet_poll,
	.ioctl =	packet_ioctl,
	.listen =	sock_no_listen,
	.shutdown =	sock_no_shutdown,
	.setsockopt =	packet_setsockopt,
	.getsockopt =	packet_getsockopt,
	.sendmsg =	packet_sendmsg,
	.recvmsg =	packet_recvmsg,
	.mmap =		packet_mmap,
	.sendpage =	sock_no_sendpage,
};
static const struct net_proto_family packet_family_ops = {
	.family =	PF_PACKET,
	.create =	packet_create,
	.owner	=	THIS_MODULE,
};
static struct notifier_block packet_netdev_notifier = {
	.notifier_call = packet_notifier,
};
#ifdef CONFIG_PROC_FS

static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(RCU)
{
	struct net *net = seq_file_net(seq);

	rcu_read_lock();
	return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
}

static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct net *net = seq_file_net(seq);
	return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
}

static void packet_seq_stop(struct seq_file *seq, void *v)
	__releases(RCU)
{
	rcu_read_unlock();
}

static int packet_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_puts(seq, "sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n");
	else {
		struct sock *s = sk_entry(v);
		const struct packet_sock *po = pkt_sk(s);

		seq_printf(seq,
			   "%pK %-6d %-4d %04x   %-5d %1d %-6u %-6u %-6lu\n",
			   s,
			   atomic_read(&s->sk_refcnt),
			   s->sk_type,
			   ntohs(po->num),
			   po->ifindex,
			   po->running,
			   atomic_read(&s->sk_rmem_alloc),
			   from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)),
			   sock_i_ino(s));
	}

	return 0;
}

static const struct seq_operations packet_seq_ops = {
	.start	= packet_seq_start,
	.next	= packet_seq_next,
	.stop	= packet_seq_stop,
	.show	= packet_seq_show,
};

static int packet_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &packet_seq_ops,
			    sizeof(struct seq_net_private));
}

static const struct file_operations packet_seq_fops = {
	.owner		= THIS_MODULE,
	.open		= packet_seq_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_net,
};

#endif
static int __net_init packet_net_init(struct net *net)
{
	mutex_init(&net->packet.sklist_lock);
	INIT_HLIST_HEAD(&net->packet.sklist);

	if (!proc_create("packet", 0, net->proc_net, &packet_seq_fops))
		return -ENOMEM;

	return 0;
}

static void __net_exit packet_net_exit(struct net *net)
{
	remove_proc_entry("packet", net->proc_net);
}

static struct pernet_operations packet_net_ops = {
	.init = packet_net_init,
	.exit = packet_net_exit,
};
static void __exit packet_exit(void)
{
	unregister_netdevice_notifier(&packet_netdev_notifier);
	unregister_pernet_subsys(&packet_net_ops);
	sock_unregister(PF_PACKET);
	proto_unregister(&packet_proto);
}

static int __init packet_init(void)
{
	int rc = proto_register(&packet_proto, 0);

	if (rc != 0)
		goto out;

	sock_register(&packet_family_ops);
	register_pernet_subsys(&packet_net_ops);
	register_netdevice_notifier(&packet_netdev_notifier);
out:
	return rc;
}

module_init(packet_init);
module_exit(packet_exit);
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_PACKET);