 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		PACKET - implements raw packet sockets.
 *
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *
 *		Alan Cox	:	verify_area() now used correctly
 *		Alan Cox	:	new skbuff lists, look ma no backlogs!
 *		Alan Cox	:	tidied skbuff lists.
 *		Alan Cox	:	Now uses generic datagram routines I
 *					added. Also fixed the peek/read crash
 *					from all old Linux datagram code.
 *		Alan Cox	:	Uses the improved datagram code.
 *		Alan Cox	:	Added NULL's for socket options.
 *		Alan Cox	:	Re-commented the code.
 *		Alan Cox	:	Use new kernel side addressing
 *		Rob Janssen	:	Correct MTU usage.
 *		Dave Platt	:	Counter leaks caused by incorrect
 *					interrupt locking and some slightly
 *					dubious gcc output. Can you read
 *					compiler: it said _VOLATILE_
 *		Richard Kooijman:	Timestamp fixes.
 *		Alan Cox	:	New buffers. Use sk->mac.raw.
 *		Alan Cox	:	sendmsg/recvmsg support.
 *		Alan Cox	:	Protocol setting support
 *		Alexey Kuznetsov:	Untied from IPv4 stack.
 *		Cyrus Durgin	:	Fixed kerneld for kmod.
 *		Michal Ostrowski:	Module initialization cleanup.
 *		Ulises Alonso	:	Frame number limit removal and
 *					packet_set_ring memory leak.
 *		Eric Biederman	:	Allow for > 8 byte hardware addresses.
 *					The convention is that longer addresses
 *					will simply extend the hardware address
 *					byte arrays at the end of sockaddr_ll
 *		Johann Baudy	:	Added TX RING.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
#include <linux/types.h>
#include <linux/capability.h>
#include <linux/fcntl.h>
#include <linux/socket.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_packet.h>
#include <linux/wireless.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/ioctls.h>
#include <asm/cacheflush.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/poll.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/mutex.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/errqueue.h>
#include <linux/net_tstamp.h>
#include <net/inet_common.h>
   - if the device has no dev->hard_header routine, it adds and removes the ll
     header inside itself. In this case the ll header is invisible outside of
     the device, but higher levels still should reserve dev->hard_header_len.
     Some devices are clever enough to reallocate the skb when the header
     will not fit into the reserved space (tunnels); others are not.
   - a packet socket receives packets with the ll header pulled,
     so SOCK_RAW should push it back.

Incoming, dev->hard_header != NULL
   mac_header -> ll header

Outgoing, dev->hard_header != NULL
   mac_header -> ll header

Incoming, dev->hard_header == NULL
   mac_header -> UNKNOWN position. It is very likely that it points to the ll
		 header. PPP does this, which is wrong, because it introduces
		 an asymmetry between the rx and tx paths.

Outgoing, dev->hard_header == NULL
   mac_header -> data. The ll header is still not built!

If dev->hard_header == NULL we are unlikely to restore a sensible ll header.

dev->hard_header != NULL
   mac_header -> ll header

dev->hard_header == NULL (ll header is added by the device, we cannot control it)

We should set nh.raw on output to the correct position,
the packet classifier depends on it.
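
/* A minimal userspace sketch of the two receive flavours described above
 * (illustrative, not part of the kernel; assumes an Ethernet device and
 * CAP_NET_RAW, error handling omitted):
 *
 *	#include <sys/socket.h>
 *	#include <linux/if_packet.h>
 *	#include <linux/if_ether.h>
 *	#include <arpa/inet.h>
 *
 *	int main(void)
 *	{
 *		// SOCK_RAW: frames arrive with the ll (Ethernet) header in place
 *		int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *		unsigned char buf[2048];
 *		ssize_t n = recvfrom(fd, buf, sizeof(buf), 0, NULL, NULL);
 *		// here buf[0..13] is the Ethernet header; a SOCK_DGRAM socket
 *		// would instead deliver the payload with the ll header pulled
 *		return n < 0;
 *	}
 */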
/* Private packet socket structures. */

struct packet_mclist {
	struct packet_mclist	*next;
	unsigned char		addr[MAX_ADDR_LEN];

/* identical to struct packet_mreq except it has
 * a longer address field.
 */
struct packet_mreq_max {
	unsigned short	mr_type;
	unsigned short	mr_alen;
	unsigned char	mr_address[MAX_ADDR_LEN];
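
/* How userspace reaches these structures, as a hedged sketch: membership
 * changes come in through setsockopt(PACKET_ADD_MEMBERSHIP) with a plain
 * struct packet_mreq (the ifindex value below is an assumption):
 *
 *	struct packet_mreq mreq = {
 *		.mr_ifindex = 2,			// e.g. from if_nametoindex()
 *		.mr_type    = PACKET_MR_PROMISC,	// no address needed here
 *	};
 *	setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP, &mreq, sizeof(mreq));
 */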
static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
		int closing, int tx_ring);

#define PGV_FROM_VMALLOC 1

struct packet_ring_buffer {
	unsigned int		frames_per_block;
	unsigned int		frame_size;
	unsigned int		frame_max;

	unsigned int		pg_vec_order;
	unsigned int		pg_vec_pages;
	unsigned int		pg_vec_len;

static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);

static void packet_flush_mclist(struct sock *sk);

/* struct sock has to be the first member of packet_sock */
	struct tpacket_stats	stats;
	struct packet_ring_buffer	rx_ring;
	struct packet_ring_buffer	tx_ring;
	spinlock_t		bind_lock;
	struct mutex		pg_vec_lock;
	unsigned int		running:1,	/* prot_hook is attached */
	int			ifindex;	/* bound device */
	struct packet_mclist	*mclist;
	enum tpacket_versions	tp_version;
	unsigned int		tp_hdrlen;
	unsigned int		tp_reserve;
	unsigned int		tp_loss:1;
	unsigned int		tp_tstamp;
	struct packet_type	prot_hook ____cacheline_aligned_in_smp;

struct packet_skb_cb {
	unsigned int origlen;
		struct sockaddr_pkt pkt;
		struct sockaddr_ll ll;

#define PACKET_SKB_CB(__skb) ((struct packet_skb_cb *)((__skb)->cb))

static void __packet_set_status(struct packet_sock *po, void *frame, int status)
		struct tpacket_hdr *h1;
		struct tpacket2_hdr *h2;

	switch (po->tp_version) {
		h.h1->tp_status = status;
		flush_dcache_page(virt_to_page(&h.h1->tp_status));
		h.h2->tp_status = status;
		flush_dcache_page(virt_to_page(&h.h2->tp_status));
		pr_err("TPACKET version not supported\n");

static int __packet_get_status(struct packet_sock *po, void *frame)
		struct tpacket_hdr *h1;
		struct tpacket2_hdr *h2;

	switch (po->tp_version) {
		flush_dcache_page(virt_to_page(&h.h1->tp_status));
		return h.h1->tp_status;
		flush_dcache_page(virt_to_page(&h.h2->tp_status));
		return h.h2->tp_status;
		pr_err("TPACKET version not supported\n");

static void *packet_lookup_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		unsigned int position,
	unsigned int pg_vec_pos, frame_offset;
		struct tpacket_hdr *h1;
		struct tpacket2_hdr *h2;

	pg_vec_pos = position / rb->frames_per_block;
	frame_offset = position % rb->frames_per_block;

	h.raw = rb->pg_vec[pg_vec_pos].buffer +
		(frame_offset * rb->frame_size);

	if (status != __packet_get_status(po, h.raw))

static inline void *packet_current_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
	return packet_lookup_frame(po, rb, rb->head, status);

static inline void *packet_previous_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
	unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
	return packet_lookup_frame(po, rb, previous, status);

static inline void packet_increment_head(struct packet_ring_buffer *buff)
	buff->head = buff->head != buff->frame_max ? buff->head + 1 : 0;
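
/* Worked example of the ring geometry these helpers assume: with
 * tp_block_size = 4096, tp_frame_size = 2048 and tp_frame_nr = 8,
 * frames_per_block = 2 and frame_max = 7, so frame 5 lives in
 * pg_vec[5 / 2] = pg_vec[2] at byte offset (5 % 2) * 2048 = 2048,
 * and packet_increment_head() wraps 7 back around to 0.
 */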
static inline struct packet_sock *pkt_sk(struct sock *sk)
	return (struct packet_sock *)sk;

static void packet_sock_destruct(struct sock *sk)
	skb_queue_purge(&sk->sk_error_queue);

	WARN_ON(atomic_read(&sk->sk_rmem_alloc));
	WARN_ON(atomic_read(&sk->sk_wmem_alloc));

	if (!sock_flag(sk, SOCK_DEAD)) {
		pr_err("Attempt to release alive packet socket: %p\n", sk);

	sk_refcnt_debug_dec(sk);

static const struct proto_ops packet_ops;

static const struct proto_ops packet_ops_spkt;

static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
			   struct packet_type *pt, struct net_device *orig_dev)
	struct sockaddr_pkt *spkt;

	/*
	 *	When we registered the protocol we saved the socket in the data
	 *	field for just this event.
	 */

	sk = pt->af_packet_priv;

	/*
	 *	Yank back the headers [hope the device set this
	 *	right or kerboom...]
	 *
	 *	Incoming packets have the ll header pulled,
	 *
	 *	For outgoing ones skb->data == skb_mac_header(skb)
	 *	so that this procedure is a noop.
	 */

	if (skb->pkt_type == PACKET_LOOPBACK)

	if (!net_eq(dev_net(dev), sock_net(sk)))

	skb = skb_share_check(skb, GFP_ATOMIC);

	/* drop any routing info */

	/* drop conntrack reference */

	spkt = &PACKET_SKB_CB(skb)->sa.pkt;

	skb_push(skb, skb->data - skb_mac_header(skb));

	/*
	 *	The SOCK_PACKET socket receives _all_ frames.
	 */

	spkt->spkt_family = dev->type;
	strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
	spkt->spkt_protocol = skb->protocol;

	/*
	 *	Charge the memory to the socket. This is done specifically
	 *	to prevent sockets using all the memory up.
	 */

	if (sock_queue_rcv_skb(sk, skb) == 0)

/*
 *	Output a raw packet to a device layer. This bypasses all the other
 *	protocol layers and you must therefore supply it with a complete frame
 */

static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
			       struct msghdr *msg, size_t len)
	struct sock *sk = sock->sk;
	struct sockaddr_pkt *saddr = (struct sockaddr_pkt *)msg->msg_name;
	struct sk_buff *skb = NULL;
	struct net_device *dev;

	/*
	 *	Get and verify the address.
	 */

	if (msg->msg_namelen < sizeof(struct sockaddr))
	if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
		proto = saddr->spkt_protocol;
	else
		return -ENOTCONN;	/* SOCK_PACKET must be sent giving an address */

	/*
	 *	Find the device first to size check it
	 */

	saddr->spkt_device[13] = 0;

	dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);

	if (!(dev->flags & IFF_UP))

	/*
	 * You may not queue a frame bigger than the mtu. This is the lowest level
	 * raw protocol and you must do your own fragmentation at this level.
	 */

	if (len > dev->mtu + dev->hard_header_len)

	size_t reserved = LL_RESERVED_SPACE(dev);
	unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;

	skb = sock_wmalloc(sk, len + reserved, 0, GFP_KERNEL);

	/* FIXME: Save some space for broken drivers that write a hard
	 * header at transmission time by themselves. PPP is the notable
	 * one here. This should really be fixed at the driver level.
	 */
	skb_reserve(skb, reserved);
	skb_reset_network_header(skb);

	/* Try to align data part correctly */
		skb_reset_network_header(skb);

	err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);

	skb->protocol = proto;
	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;
	err = sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);

static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk,
	struct sk_filter *filter;

	filter = rcu_dereference_bh(sk->sk_filter);
		res = sk_run_filter(skb, filter->insns, filter->len);
	rcu_read_unlock_bh();
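
/* The filter consulted here is installed from userspace via the generic
 * SO_ATTACH_FILTER path; a minimal classic-BPF sketch that keeps only
 * IPv4 frames (illustrative, offsets assume Ethernet):
 *
 *	#include <linux/filter.h>
 *
 *	struct sock_filter code[] = {
 *		{ 0x28, 0, 0, 12 },		// ldh [12]  : load EtherType
 *		{ 0x15, 0, 1, 0x0800 },		// jeq 0x0800 ? next : drop
 *		{ 0x06, 0, 0, 0xffff },		// ret 65535 : keep packet
 *		{ 0x06, 0, 0, 0 },		// ret 0     : drop packet
 *	};
 *	struct sock_fprog prog = { .len = 4, .filter = code };
 *	setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog));
 */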
/*
 * This function does lazy skb cloning in the hope that most packets
 * are discarded by BPF.
 *
 * Note the tricky part: we DO mangle shared skbs! skb->data, skb->len
 * and skb->cb are mangled. It works because (and until) packets
 * falling here are owned by the current CPU. Output packets are cloned
 * by dev_queue_xmit_nit(), input packets are processed by net_bh
 * sequentially, so that if we return the skb to its original state on
 * exit, we will not harm anyone.
 */

static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
		      struct packet_type *pt, struct net_device *orig_dev)
	struct sockaddr_ll *sll;
	struct packet_sock *po;
	u8 *skb_head = skb->data;
	int skb_len = skb->len;
	unsigned int snaplen, res;

	if (skb->pkt_type == PACKET_LOOPBACK)

	sk = pt->af_packet_priv;

	if (!net_eq(dev_net(dev), sock_net(sk)))

	if (dev->header_ops) {
		/* The device has an explicit notion of ll header,
		 * exported to higher levels.
		 *
		 * Otherwise, the device hides the details of its frame
		 * structure, so that the corresponding packet head is
		 * never delivered to the user.
		 */
		if (sk->sk_type != SOCK_DGRAM)
			skb_push(skb, skb->data - skb_mac_header(skb));
		else if (skb->pkt_type == PACKET_OUTGOING) {
			/* Special case: outgoing packets have ll header at head */
			skb_pull(skb, skb_network_offset(skb));

	res = run_filter(skb, sk, snaplen);

	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
	    (unsigned)sk->sk_rcvbuf)

	if (skb_shared(skb)) {
		struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);

		if (skb_head != skb->data) {
			skb->data = skb_head;

	BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 >

	sll = &PACKET_SKB_CB(skb)->sa.ll;
	sll->sll_family = AF_PACKET;
	sll->sll_hatype = dev->type;
	sll->sll_protocol = skb->protocol;
	sll->sll_pkttype = skb->pkt_type;
	if (unlikely(po->origdev))
		sll->sll_ifindex = orig_dev->ifindex;
	else
		sll->sll_ifindex = dev->ifindex;

	sll->sll_halen = dev_parse_header(skb, sll->sll_addr);

	PACKET_SKB_CB(skb)->origlen = skb->len;

	if (pskb_trim(skb, snaplen))

	skb_set_owner_r(skb, sk);

	/* drop conntrack reference */

	spin_lock(&sk->sk_receive_queue.lock);
	po->stats.tp_packets++;
	skb->dropcount = atomic_read(&sk->sk_drops);
	__skb_queue_tail(&sk->sk_receive_queue, skb);
	spin_unlock(&sk->sk_receive_queue.lock);
	sk->sk_data_ready(sk, skb->len);

	po->stats.tp_drops = atomic_inc_return(&sk->sk_drops);

	if (skb_head != skb->data && skb_shared(skb)) {
		skb->data = skb_head;
static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
		       struct packet_type *pt, struct net_device *orig_dev)
	struct packet_sock *po;
	struct sockaddr_ll *sll;
		struct tpacket_hdr *h1;
		struct tpacket2_hdr *h2;
	u8 *skb_head = skb->data;
	int skb_len = skb->len;
	unsigned int snaplen, res;
	unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER;
	unsigned short macoff, netoff, hdrlen;
	struct sk_buff *copy_skb = NULL;
	struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);

	if (skb->pkt_type == PACKET_LOOPBACK)

	sk = pt->af_packet_priv;

	if (!net_eq(dev_net(dev), sock_net(sk)))

	if (dev->header_ops) {
		if (sk->sk_type != SOCK_DGRAM)
			skb_push(skb, skb->data - skb_mac_header(skb));
		else if (skb->pkt_type == PACKET_OUTGOING) {
			/* Special case: outgoing packets have ll header at head */
			skb_pull(skb, skb_network_offset(skb));

	if (skb->ip_summed == CHECKSUM_PARTIAL)
		status |= TP_STATUS_CSUMNOTREADY;

	res = run_filter(skb, sk, snaplen);

	if (sk->sk_type == SOCK_DGRAM) {
		macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
		unsigned maclen = skb_network_offset(skb);
		netoff = TPACKET_ALIGN(po->tp_hdrlen +
				       (maclen < 16 ? 16 : maclen)) +
		macoff = netoff - maclen;

	if (macoff + snaplen > po->rx_ring.frame_size) {
		if (po->copy_thresh &&
		    atomic_read(&sk->sk_rmem_alloc) + skb->truesize <
		    (unsigned)sk->sk_rcvbuf) {
			if (skb_shared(skb)) {
				copy_skb = skb_clone(skb, GFP_ATOMIC);
				copy_skb = skb_get(skb);
				skb_head = skb->data;
				skb_set_owner_r(copy_skb, sk);
		snaplen = po->rx_ring.frame_size - macoff;
		if ((int)snaplen < 0)

	spin_lock(&sk->sk_receive_queue.lock);
	h.raw = packet_current_frame(po, &po->rx_ring, TP_STATUS_KERNEL);
	packet_increment_head(&po->rx_ring);
	po->stats.tp_packets++;
		status |= TP_STATUS_COPY;
		__skb_queue_tail(&sk->sk_receive_queue, copy_skb);
	if (!po->stats.tp_drops)
		status &= ~TP_STATUS_LOSING;
	spin_unlock(&sk->sk_receive_queue.lock);

	skb_copy_bits(skb, 0, h.raw + macoff, snaplen);

	switch (po->tp_version) {
		h.h1->tp_len = skb->len;
		h.h1->tp_snaplen = snaplen;
		h.h1->tp_mac = macoff;
		h.h1->tp_net = netoff;
		if ((po->tp_tstamp & SOF_TIMESTAMPING_SYS_HARDWARE)
				&& shhwtstamps->syststamp.tv64)
			tv = ktime_to_timeval(shhwtstamps->syststamp);
		else if ((po->tp_tstamp & SOF_TIMESTAMPING_RAW_HARDWARE)
				&& shhwtstamps->hwtstamp.tv64)
			tv = ktime_to_timeval(shhwtstamps->hwtstamp);
		else if (skb->tstamp.tv64)
			tv = ktime_to_timeval(skb->tstamp);
		else
			do_gettimeofday(&tv);
		h.h1->tp_sec = tv.tv_sec;
		h.h1->tp_usec = tv.tv_usec;
		hdrlen = sizeof(*h.h1);
		h.h2->tp_len = skb->len;
		h.h2->tp_snaplen = snaplen;
		h.h2->tp_mac = macoff;
		h.h2->tp_net = netoff;
		if ((po->tp_tstamp & SOF_TIMESTAMPING_SYS_HARDWARE)
				&& shhwtstamps->syststamp.tv64)
			ts = ktime_to_timespec(shhwtstamps->syststamp);
		else if ((po->tp_tstamp & SOF_TIMESTAMPING_RAW_HARDWARE)
				&& shhwtstamps->hwtstamp.tv64)
			ts = ktime_to_timespec(shhwtstamps->hwtstamp);
		else if (skb->tstamp.tv64)
			ts = ktime_to_timespec(skb->tstamp);
		h.h2->tp_sec = ts.tv_sec;
		h.h2->tp_nsec = ts.tv_nsec;
		h.h2->tp_vlan_tci = vlan_tx_tag_get(skb);
		hdrlen = sizeof(*h.h2);

	sll = h.raw + TPACKET_ALIGN(hdrlen);
	sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
	sll->sll_family = AF_PACKET;
	sll->sll_hatype = dev->type;
	sll->sll_protocol = skb->protocol;
	sll->sll_pkttype = skb->pkt_type;
	if (unlikely(po->origdev))
		sll->sll_ifindex = orig_dev->ifindex;
	else
		sll->sll_ifindex = dev->ifindex;

	__packet_set_status(po, h.raw, status);

	struct page *p_start, *p_end;
	u8 *h_end = h.raw + macoff + snaplen - 1;

	p_start = virt_to_page(h.raw);
	p_end = virt_to_page(h_end);
	while (p_start <= p_end) {
		flush_dcache_page(p_start);

	sk->sk_data_ready(sk, 0);

	if (skb_head != skb->data && skb_shared(skb)) {
		skb->data = skb_head;

	po->stats.tp_drops++;
	spin_unlock(&sk->sk_receive_queue.lock);

	sk->sk_data_ready(sk, 0);
static void tpacket_destruct_skb(struct sk_buff *skb)
	struct packet_sock *po = pkt_sk(skb->sk);

	if (likely(po->tx_ring.pg_vec)) {
		ph = skb_shinfo(skb)->destructor_arg;
		BUG_ON(__packet_get_status(po, ph) != TP_STATUS_SENDING);
		BUG_ON(atomic_read(&po->tx_ring.pending) == 0);
		atomic_dec(&po->tx_ring.pending);
		__packet_set_status(po, ph, TP_STATUS_AVAILABLE);

static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
		void *frame, struct net_device *dev, int size_max,
		__be16 proto, unsigned char *addr)
		struct tpacket_hdr *h1;
		struct tpacket2_hdr *h2;
	int to_write, offset, len, tp_len, nr_frags, len_max;
	struct socket *sock = po->sk.sk_socket;

	skb->protocol = proto;
	skb->priority = po->sk.sk_priority;
	skb->mark = po->sk.sk_mark;
	skb_shinfo(skb)->destructor_arg = ph.raw;

	switch (po->tp_version) {
		tp_len = ph.h2->tp_len;
		tp_len = ph.h1->tp_len;

	if (unlikely(tp_len > size_max)) {
		pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);

	skb_reserve(skb, LL_RESERVED_SPACE(dev));
	skb_reset_network_header(skb);

	data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll);

	if (sock->type == SOCK_DGRAM) {
		err = dev_hard_header(skb, dev, ntohs(proto), addr,
		if (unlikely(err < 0))
	} else if (dev->hard_header_len) {
		/* net device doesn't like empty head */
		if (unlikely(tp_len <= dev->hard_header_len)) {
			pr_err("packet size is too short (%d < %d)\n",
			       tp_len, dev->hard_header_len);

		skb_push(skb, dev->hard_header_len);
		err = skb_store_bits(skb, 0, data,
				     dev->hard_header_len);

		data += dev->hard_header_len;
		to_write -= dev->hard_header_len;

	page = virt_to_page(data);
	offset = offset_in_page(data);
	len_max = PAGE_SIZE - offset;
	len = ((to_write > len_max) ? len_max : to_write);

	skb->data_len = to_write;
	skb->len += to_write;
	skb->truesize += to_write;
	atomic_add(to_write, &po->sk.sk_wmem_alloc);

	while (likely(to_write)) {
		nr_frags = skb_shinfo(skb)->nr_frags;

		if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
			pr_err("Packet exceeds the number of skb frags (%lu)\n",

		flush_dcache_page(page);

		skb_fill_page_desc(skb,
				   page++, offset, len);

		len = ((to_write > len_max) ? len_max : to_write);
static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
	struct net_device *dev;
	int ifindex, err, reserve = 0;
	struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
	int tp_len, size_max;

	sock = po->sk.sk_socket;

	mutex_lock(&po->pg_vec_lock);

		ifindex = po->ifindex;

		if (msg->msg_namelen < sizeof(struct sockaddr_ll))
		if (msg->msg_namelen < (saddr->sll_halen
					+ offsetof(struct sockaddr_ll,
		ifindex = saddr->sll_ifindex;
		proto = saddr->sll_protocol;
		addr = saddr->sll_addr;

	dev = dev_get_by_index(sock_net(&po->sk), ifindex);

	if (unlikely(dev == NULL))

	reserve = dev->hard_header_len;

	if (unlikely(!(dev->flags & IFF_UP)))

	size_max = po->tx_ring.frame_size
		   - (po->tp_hdrlen - sizeof(struct sockaddr_ll));

	if (size_max > dev->mtu + reserve)
		size_max = dev->mtu + reserve;

		ph = packet_current_frame(po, &po->tx_ring,
					  TP_STATUS_SEND_REQUEST);

		if (unlikely(ph == NULL)) {

		status = TP_STATUS_SEND_REQUEST;
		skb = sock_alloc_send_skb(&po->sk,
					  LL_ALLOCATED_SPACE(dev)
					  + sizeof(struct sockaddr_ll),

		if (unlikely(skb == NULL))

		tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto,

		if (unlikely(tp_len < 0)) {
				__packet_set_status(po, ph,
						    TP_STATUS_AVAILABLE);
				packet_increment_head(&po->tx_ring);
				status = TP_STATUS_WRONG_FORMAT;

		skb->destructor = tpacket_destruct_skb;
		__packet_set_status(po, ph, TP_STATUS_SENDING);
		atomic_inc(&po->tx_ring.pending);

		status = TP_STATUS_SEND_REQUEST;
		err = dev_queue_xmit(skb);
		if (unlikely(err > 0)) {
			err = net_xmit_errno(err);
			if (err && __packet_get_status(po, ph) ==
				   TP_STATUS_AVAILABLE) {
				/* skb was destructed already */
			/*
			 * skb was dropped but not destructed yet;
			 * let's treat it like congestion or err < 0
			 */

		packet_increment_head(&po->tx_ring);
	} while (likely((ph != NULL) ||
			((!(msg->msg_flags & MSG_DONTWAIT)) &&
			 (atomic_read(&po->tx_ring.pending))))

	__packet_set_status(po, ph, status);

	mutex_unlock(&po->pg_vec_lock);
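
/* Rough userspace counterpart of the loop above (a sketch under the
 * assumption of TPACKET_V1 frame layout; ring, frame_nr, frame and
 * frame_len are hypothetical variables, error handling omitted):
 *
 *	struct tpacket_hdr *hdr =
 *		(struct tpacket_hdr *)(ring + frame_nr * req.tp_frame_size);
 *	char *data = (char *)hdr + TPACKET_ALIGN(sizeof(*hdr));
 *
 *	memcpy(data, frame, frame_len);			// a complete l2 frame
 *	hdr->tp_len = frame_len;
 *	hdr->tp_status = TP_STATUS_SEND_REQUEST;	// hand it to the kernel
 *	send(fd, NULL, 0, 0);				// kicks tpacket_snd()
 */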
static inline struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
					       size_t reserve, size_t len,
					       size_t linear, int noblock,
	struct sk_buff *skb;

	/* Under a page? Don't bother with paged skb. */
	if (prepad + len < PAGE_SIZE || !linear)

	skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,

	skb_reserve(skb, reserve);
	skb_put(skb, linear);
	skb->data_len = len - linear;
	skb->len += len - linear;

static int packet_snd(struct socket *sock,
		      struct msghdr *msg, size_t len)
	struct sock *sk = sock->sk;
	struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
	struct sk_buff *skb;
	struct net_device *dev;
	unsigned char *addr;
	int ifindex, err, reserve = 0;
	struct virtio_net_hdr vnet_hdr = { 0 };
	struct packet_sock *po = pkt_sk(sk);
	unsigned short gso_type = 0;

	/*
	 *	Get and verify the address.
	 */

	if (saddr == NULL) {
		ifindex = po->ifindex;
		if (msg->msg_namelen < sizeof(struct sockaddr_ll))
		if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
		ifindex = saddr->sll_ifindex;
		proto = saddr->sll_protocol;
		addr = saddr->sll_addr;

	dev = dev_get_by_index(sock_net(sk), ifindex);

	if (sock->type == SOCK_RAW)
		reserve = dev->hard_header_len;

	if (!(dev->flags & IFF_UP))

	if (po->has_vnet_hdr) {
		vnet_hdr_len = sizeof(vnet_hdr);

		if (len < vnet_hdr_len)

		len -= vnet_hdr_len;

		err = memcpy_fromiovec((void *)&vnet_hdr, msg->msg_iov,

		if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
		    (vnet_hdr.csum_start + vnet_hdr.csum_offset + 2 >
			vnet_hdr.hdr_len = vnet_hdr.csum_start +
					   vnet_hdr.csum_offset + 2;

		if (vnet_hdr.hdr_len > len)

		if (vnet_hdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) {
			switch (vnet_hdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
			case VIRTIO_NET_HDR_GSO_TCPV4:
				gso_type = SKB_GSO_TCPV4;
			case VIRTIO_NET_HDR_GSO_TCPV6:
				gso_type = SKB_GSO_TCPV6;
			case VIRTIO_NET_HDR_GSO_UDP:
				gso_type = SKB_GSO_UDP;

			if (vnet_hdr.gso_type & VIRTIO_NET_HDR_GSO_ECN)
				gso_type |= SKB_GSO_TCP_ECN;

			if (vnet_hdr.gso_size == 0)

	if (!gso_type && (len > dev->mtu + reserve))

	skb = packet_alloc_skb(sk, LL_ALLOCATED_SPACE(dev),
			       LL_RESERVED_SPACE(dev), len, vnet_hdr.hdr_len,
			       msg->msg_flags & MSG_DONTWAIT, &err);

	skb_set_network_header(skb, reserve);

	if (sock->type == SOCK_DGRAM &&
	    (offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len)) < 0)

	/* Returns -EFAULT on error */
	err = skb_copy_datagram_from_iovec(skb, offset, msg->msg_iov, 0, len);

	err = sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);

	skb->protocol = proto;
	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	if (po->has_vnet_hdr) {
		if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
			if (!skb_partial_csum_set(skb, vnet_hdr.csum_start,
						  vnet_hdr.csum_offset)) {

		skb_shinfo(skb)->gso_size = vnet_hdr.gso_size;
		skb_shinfo(skb)->gso_type = gso_type;

		/* Header must be checked, and gso_segs computed. */
		skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
		skb_shinfo(skb)->gso_segs = 0;

		len += vnet_hdr_len;

	err = dev_queue_xmit(skb);
	if (err > 0 && (err = net_xmit_errno(err)) != 0)

static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
			  struct msghdr *msg, size_t len)
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);

	if (po->tx_ring.pg_vec)
		return tpacket_snd(po, msg);
	return packet_snd(sock, msg, len);
/*
 *	Close a PACKET socket. This is fairly simple. We immediately go
 *	to 'closed' state and remove our protocol entry in the device list.
 */

static int packet_release(struct socket *sock)
	struct sock *sk = sock->sk;
	struct packet_sock *po;
	struct tpacket_req req;

	spin_lock_bh(&net->packet.sklist_lock);
	sk_del_node_init_rcu(sk);
	sock_prot_inuse_add(net, sk->sk_prot, -1);
	spin_unlock_bh(&net->packet.sklist_lock);

	spin_lock(&po->bind_lock);

	/*
	 *	Remove from protocol table
	 */

	__dev_remove_pack(&po->prot_hook);

	spin_unlock(&po->bind_lock);

	packet_flush_mclist(sk);

	memset(&req, 0, sizeof(req));

	if (po->rx_ring.pg_vec)
		packet_set_ring(sk, &req, 1, 0);

	if (po->tx_ring.pg_vec)
		packet_set_ring(sk, &req, 1, 1);

	/*
	 *	Now the socket is dead. No more input will appear.
	 */

	skb_queue_purge(&sk->sk_receive_queue);
	sk_refcnt_debug_release(sk);

/*
 *	Attach a packet hook.
 */

static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
	struct packet_sock *po = pkt_sk(sk);

	/*
	 *	Detach an existing hook if present.
	 */

	spin_lock(&po->bind_lock);
		spin_unlock(&po->bind_lock);
		dev_remove_pack(&po->prot_hook);
		spin_lock(&po->bind_lock);

	po->prot_hook.type = protocol;
	po->prot_hook.dev = dev;

	po->ifindex = dev ? dev->ifindex : 0;

	if (!dev || (dev->flags & IFF_UP)) {
		dev_add_pack(&po->prot_hook);
		sk->sk_err = ENETDOWN;
		if (!sock_flag(sk, SOCK_DEAD))
			sk->sk_error_report(sk);

	spin_unlock(&po->bind_lock);

/*
 *	Bind a packet socket to a device
 */

static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
	struct sock *sk = sock->sk;
	struct net_device *dev;

	if (addr_len != sizeof(struct sockaddr))

	strlcpy(name, uaddr->sa_data, sizeof(name));

	dev = dev_get_by_name(sock_net(sk), name);
		err = packet_do_bind(sk, dev, pkt_sk(sk)->num);

static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
	struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
	struct sock *sk = sock->sk;
	struct net_device *dev = NULL;

	if (addr_len < sizeof(struct sockaddr_ll))
	if (sll->sll_family != AF_PACKET)

	if (sll->sll_ifindex) {
		dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex);

	err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
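
/* Typical userspace use of this bind path, as an illustrative sketch
 * ("eth0" is an assumption):
 *
 *	struct sockaddr_ll sll = {
 *		.sll_family   = AF_PACKET,
 *		.sll_protocol = htons(ETH_P_ALL),
 *		.sll_ifindex  = if_nametoindex("eth0"),
 *	};
 *	bind(fd, (struct sockaddr *)&sll, sizeof(sll));
 */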
static struct proto packet_proto = {
	.owner	  = THIS_MODULE,
	.obj_size = sizeof(struct packet_sock),

/*
 *	Create a packet of type SOCK_PACKET.
 */

static int packet_create(struct net *net, struct socket *sock, int protocol,
	struct packet_sock *po;
	__be16 proto = (__force __be16)protocol; /* weird, but documented */

	if (!capable(CAP_NET_RAW))
	if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
	    sock->type != SOCK_PACKET)
		return -ESOCKTNOSUPPORT;

	sock->state = SS_UNCONNECTED;

	sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);

	sock->ops = &packet_ops;
	if (sock->type == SOCK_PACKET)
		sock->ops = &packet_ops_spkt;

	sock_init_data(sock, sk);

	sk->sk_family = PF_PACKET;

	sk->sk_destruct = packet_sock_destruct;
	sk_refcnt_debug_inc(sk);

	/*
	 *	Attach a protocol block
	 */

	spin_lock_init(&po->bind_lock);
	mutex_init(&po->pg_vec_lock);
	po->prot_hook.func = packet_rcv;

	if (sock->type == SOCK_PACKET)
		po->prot_hook.func = packet_rcv_spkt;

	po->prot_hook.af_packet_priv = sk;

	po->prot_hook.type = proto;
	dev_add_pack(&po->prot_hook);

	spin_lock_bh(&net->packet.sklist_lock);
	sk_add_node_rcu(sk, &net->packet.sklist);
	sock_prot_inuse_add(net, &packet_proto, 1);
	spin_unlock_bh(&net->packet.sklist_lock);

static int packet_recv_error(struct sock *sk, struct msghdr *msg, int len)
	struct sock_exterr_skb *serr;
	struct sk_buff *skb, *skb2;

	skb = skb_dequeue(&sk->sk_error_queue);

		msg->msg_flags |= MSG_TRUNC;

	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);

	sock_recv_timestamp(msg, sk, skb);

	serr = SKB_EXT_ERR(skb);
	put_cmsg(msg, SOL_PACKET, PACKET_TX_TIMESTAMP,
		 sizeof(serr->ee), &serr->ee);

	msg->msg_flags |= MSG_ERRQUEUE;

	/* Reset and regenerate socket error */
	spin_lock_bh(&sk->sk_error_queue.lock);
	if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) {
		sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno;
		spin_unlock_bh(&sk->sk_error_queue.lock);
		sk->sk_error_report(sk);
		spin_unlock_bh(&sk->sk_error_queue.lock);

/*
 *	Pull a packet from our receive queue and hand it to the user.
 *	If necessary we block.
 */

static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
			  struct msghdr *msg, size_t len, int flags)
	struct sock *sk = sock->sk;
	struct sk_buff *skb;
	struct sockaddr_ll *sll;
	int vnet_hdr_len = 0;

	if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))

	/* What error should we return now? EUNATTACH? */
	if (pkt_sk(sk)->ifindex < 0)

	if (flags & MSG_ERRQUEUE) {
		err = packet_recv_error(sk, msg, len);

	/*
	 *	Call the generic datagram receiver. This handles all sorts
	 *	of horrible races and re-entrancy so we can forget about it
	 *	in the protocol layers.
	 *
	 *	Now it will return ENETDOWN, if the device has just gone down,
	 *	but then it will block.
	 */

	skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);

	/*
	 *	An error occurred so return it. Because skb_recv_datagram()
	 *	handles the blocking we don't see and worry about blocking
	 */

	if (pkt_sk(sk)->has_vnet_hdr) {
		struct virtio_net_hdr vnet_hdr = { 0 };

		vnet_hdr_len = sizeof(vnet_hdr);
		if (len < vnet_hdr_len)

		len -= vnet_hdr_len;

		if (skb_is_gso(skb)) {
			struct skb_shared_info *sinfo = skb_shinfo(skb);

			/* This is a hint as to how much should be linear. */
			vnet_hdr.hdr_len = skb_headlen(skb);
			vnet_hdr.gso_size = sinfo->gso_size;
			if (sinfo->gso_type & SKB_GSO_TCPV4)
				vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
			else if (sinfo->gso_type & SKB_GSO_TCPV6)
				vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
			else if (sinfo->gso_type & SKB_GSO_UDP)
				vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_UDP;
			else if (sinfo->gso_type & SKB_GSO_FCOE)

			if (sinfo->gso_type & SKB_GSO_TCP_ECN)
				vnet_hdr.gso_type |= VIRTIO_NET_HDR_GSO_ECN;
		} else
			vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE;

		if (skb->ip_summed == CHECKSUM_PARTIAL) {
			vnet_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
			vnet_hdr.csum_start = skb->csum_start -
			vnet_hdr.csum_offset = skb->csum_offset;
		} /* else everything is zero */

		err = memcpy_toiovec(msg->msg_iov, (void *)&vnet_hdr,

	/*
	 *	If the address length field is there to be filled in, we fill
	 *	it in now.
	 */

	sll = &PACKET_SKB_CB(skb)->sa.ll;
	if (sock->type == SOCK_PACKET)
		msg->msg_namelen = sizeof(struct sockaddr_pkt);
	else
		msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr);

	/*
	 *	You lose any data beyond the buffer you gave. If it worries
	 *	a user program they can ask the device for its MTU anyway.
	 */

		msg->msg_flags |= MSG_TRUNC;

	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);

	sock_recv_ts_and_drops(msg, sk, skb);

		memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,

	if (pkt_sk(sk)->auxdata) {
		struct tpacket_auxdata aux;

		aux.tp_status = TP_STATUS_USER;
		if (skb->ip_summed == CHECKSUM_PARTIAL)
			aux.tp_status |= TP_STATUS_CSUMNOTREADY;
		aux.tp_len = PACKET_SKB_CB(skb)->origlen;
		aux.tp_snaplen = skb->len;
		aux.tp_net = skb_network_offset(skb);
		aux.tp_vlan_tci = vlan_tx_tag_get(skb);

		put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
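
	/* Userspace picks this control message up with recvmsg(); a hedged
	 * sketch of the receive side (buf is a hypothetical data buffer):
	 *
	 *	unsigned char ctrl[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
	 *	struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
	 *	struct msghdr m = {
	 *		.msg_iov = &iov, .msg_iovlen = 1,
	 *		.msg_control = ctrl, .msg_controllen = sizeof(ctrl),
	 *	};
	 *	struct cmsghdr *c;
	 *
	 *	recvmsg(fd, &m, 0);
	 *	for (c = CMSG_FIRSTHDR(&m); c; c = CMSG_NXTHDR(&m, c))
	 *		if (c->cmsg_level == SOL_PACKET &&
	 *		    c->cmsg_type == PACKET_AUXDATA) {
	 *			struct tpacket_auxdata *aux = (void *)CMSG_DATA(c);
	 *			// aux->tp_len is the original length,
	 *			// aux->tp_vlan_tci the VLAN tag,
	 *			// aux->tp_status the checksum hints
	 *		}
	 */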
	/*
	 *	Free or return the buffer as appropriate. Again this
	 *	hides all the races and re-entrancy issues from us.
	 */
	err = vnet_hdr_len + ((flags & MSG_TRUNC) ? skb->len : copied);

	skb_free_datagram(sk, skb);

static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
			       int *uaddr_len, int peer)
	struct net_device *dev;
	struct sock *sk = sock->sk;

	uaddr->sa_family = AF_PACKET;

	dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
		strncpy(uaddr->sa_data, dev->name, 14);
		memset(uaddr->sa_data, 0, 14);

	*uaddr_len = sizeof(*uaddr);

static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
			  int *uaddr_len, int peer)
	struct net_device *dev;
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);

	sll->sll_family = AF_PACKET;
	sll->sll_ifindex = po->ifindex;
	sll->sll_protocol = po->num;
	sll->sll_pkttype = 0;

	dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
		sll->sll_hatype = dev->type;
		sll->sll_halen = dev->addr_len;
		memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
		sll->sll_hatype = 0;	/* Bad: we have no ARPHRD_UNSPEC */

	*uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;

static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
	case PACKET_MR_MULTICAST:
		if (i->alen != dev->addr_len)
			return dev_mc_add(dev, i->addr);
			return dev_mc_del(dev, i->addr);
	case PACKET_MR_PROMISC:
		return dev_set_promiscuity(dev, what);
	case PACKET_MR_ALLMULTI:
		return dev_set_allmulti(dev, what);
	case PACKET_MR_UNICAST:
		if (i->alen != dev->addr_len)
			return dev_uc_add(dev, i->addr);
			return dev_uc_del(dev, i->addr);

static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
	for ( ; i; i = i->next) {
		if (i->ifindex == dev->ifindex)
			packet_dev_mc(dev, i, what);

static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
	struct packet_sock *po = pkt_sk(sk);
	struct packet_mclist *ml, *i;
	struct net_device *dev;

	dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);

	if (mreq->mr_alen > dev->addr_len)

	i = kmalloc(sizeof(*i), GFP_KERNEL);

	for (ml = po->mclist; ml; ml = ml->next) {
		if (ml->ifindex == mreq->mr_ifindex &&
		    ml->type == mreq->mr_type &&
		    ml->alen == mreq->mr_alen &&
		    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
			/* Free the new element ... */

	i->type = mreq->mr_type;
	i->ifindex = mreq->mr_ifindex;
	i->alen = mreq->mr_alen;
	memcpy(i->addr, mreq->mr_address, i->alen);

	i->next = po->mclist;

	err = packet_dev_mc(dev, i, 1);
		po->mclist = i->next;

static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
	struct packet_mclist *ml, **mlp;

	for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
		if (ml->ifindex == mreq->mr_ifindex &&
		    ml->type == mreq->mr_type &&
		    ml->alen == mreq->mr_alen &&
		    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
			if (--ml->count == 0) {
				struct net_device *dev;

				dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
					packet_dev_mc(dev, ml, -1);

	return -EADDRNOTAVAIL;

static void packet_flush_mclist(struct sock *sk)
	struct packet_sock *po = pkt_sk(sk);
	struct packet_mclist *ml;

	while ((ml = po->mclist) != NULL) {
		struct net_device *dev;

		po->mclist = ml->next;
		dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
			packet_dev_mc(dev, ml, -1);

static int
packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);

	if (level != SOL_PACKET)
		return -ENOPROTOOPT;

	case PACKET_ADD_MEMBERSHIP:
	case PACKET_DROP_MEMBERSHIP:
		struct packet_mreq_max mreq;

		memset(&mreq, 0, sizeof(mreq));
		if (len < sizeof(struct packet_mreq))
		if (len > sizeof(mreq))
		if (copy_from_user(&mreq, optval, len))
		if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
		if (optname == PACKET_ADD_MEMBERSHIP)
			ret = packet_mc_add(sk, &mreq);
			ret = packet_mc_drop(sk, &mreq);

	case PACKET_RX_RING:
	case PACKET_TX_RING:
		struct tpacket_req req;

		if (optlen < sizeof(req))
		if (pkt_sk(sk)->has_vnet_hdr)
		if (copy_from_user(&req, optval, sizeof(req)))
		return packet_set_ring(sk, &req, 0, optname == PACKET_TX_RING);
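
	/* End-to-end userspace sketch of driving this option (illustrative;
	 * the sizes are assumptions that must pass the sanity checks in
	 * packet_set_ring(), e.g. frame size dividing block size):
	 *
	 *	int ver = TPACKET_V2;
	 *	struct tpacket_req req = {
	 *		.tp_block_size = 4096, .tp_block_nr = 64,
	 *		.tp_frame_size = 2048, .tp_frame_nr  = 128,	// 2 per block
	 *	};
	 *	void *ring;
	 *
	 *	setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver));
	 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
	 *	ring = mmap(NULL, (size_t)req.tp_block_size * req.tp_block_nr,
	 *		    PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	 *	// poll() the socket, scan frames whose tp_status contains
	 *	// TP_STATUS_USER, and write TP_STATUS_KERNEL back when done.
	 */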
	case PACKET_COPY_THRESH:
		if (optlen != sizeof(val))
		if (copy_from_user(&val, optval, sizeof(val)))

		pkt_sk(sk)->copy_thresh = val;

	case PACKET_VERSION:
		if (optlen != sizeof(val))
		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
		if (copy_from_user(&val, optval, sizeof(val)))
			po->tp_version = val;

	case PACKET_RESERVE:
		if (optlen != sizeof(val))
		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
		if (copy_from_user(&val, optval, sizeof(val)))
		po->tp_reserve = val;

		if (optlen != sizeof(val))
		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
		if (copy_from_user(&val, optval, sizeof(val)))
		po->tp_loss = !!val;

	case PACKET_AUXDATA:
		if (optlen < sizeof(val))
		if (copy_from_user(&val, optval, sizeof(val)))

		po->auxdata = !!val;

	case PACKET_ORIGDEV:
		if (optlen < sizeof(val))
		if (copy_from_user(&val, optval, sizeof(val)))

		po->origdev = !!val;

	case PACKET_VNET_HDR:
		if (sock->type != SOCK_RAW)
		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
		if (optlen < sizeof(val))
		if (copy_from_user(&val, optval, sizeof(val)))

		po->has_vnet_hdr = !!val;

	case PACKET_TIMESTAMP:
		if (optlen != sizeof(val))
		if (copy_from_user(&val, optval, sizeof(val)))

		po->tp_tstamp = val;

		return -ENOPROTOOPT;

static int packet_getsockopt(struct socket *sock, int level, int optname,
			     char __user *optval, int __user *optlen)
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	struct tpacket_stats st;

	if (level != SOL_PACKET)
		return -ENOPROTOOPT;

	if (get_user(len, optlen))

	case PACKET_STATISTICS:
		if (len > sizeof(struct tpacket_stats))
			len = sizeof(struct tpacket_stats);
		spin_lock_bh(&sk->sk_receive_queue.lock);
		memset(&po->stats, 0, sizeof(st));
		spin_unlock_bh(&sk->sk_receive_queue.lock);
		st.tp_packets += st.tp_drops;

	case PACKET_AUXDATA:
		if (len > sizeof(int))

	case PACKET_ORIGDEV:
		if (len > sizeof(int))

	case PACKET_VNET_HDR:
		if (len > sizeof(int))
		val = po->has_vnet_hdr;

	case PACKET_VERSION:
		if (len > sizeof(int))
		val = po->tp_version;

		if (len > sizeof(int))
		if (copy_from_user(&val, optval, len))
			val = sizeof(struct tpacket_hdr);
			val = sizeof(struct tpacket2_hdr);

	case PACKET_RESERVE:
		if (len > sizeof(unsigned int))
			len = sizeof(unsigned int);
		val = po->tp_reserve;

		if (len > sizeof(unsigned int))
			len = sizeof(unsigned int);

	case PACKET_TIMESTAMP:
		if (len > sizeof(int))
		val = po->tp_tstamp;

		return -ENOPROTOOPT;

	if (put_user(len, optlen))
	if (copy_to_user(optval, data, len))

static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data)
	struct hlist_node *node;
	struct net_device *dev = data;
	struct net *net = dev_net(dev);

	sk_for_each_rcu(sk, node, &net->packet.sklist) {
		struct packet_sock *po = pkt_sk(sk);

		case NETDEV_UNREGISTER:
				packet_dev_mclist(dev, po->mclist, -1);

			if (dev->ifindex == po->ifindex) {
				spin_lock(&po->bind_lock);
					__dev_remove_pack(&po->prot_hook);
					sk->sk_err = ENETDOWN;
					if (!sock_flag(sk, SOCK_DEAD))
						sk->sk_error_report(sk);
				if (msg == NETDEV_UNREGISTER) {
					po->prot_hook.dev = NULL;
				spin_unlock(&po->bind_lock);

			if (dev->ifindex == po->ifindex) {
				spin_lock(&po->bind_lock);
				if (po->num && !po->running) {
					dev_add_pack(&po->prot_hook);
				spin_unlock(&po->bind_lock);

static int packet_ioctl(struct socket *sock, unsigned int cmd,
	struct sock *sk = sock->sk;

		int amount = sk_wmem_alloc_get(sk);

		return put_user(amount, (int __user *)arg);

		struct sk_buff *skb;

		spin_lock_bh(&sk->sk_receive_queue.lock);
		skb = skb_peek(&sk->sk_receive_queue);
		spin_unlock_bh(&sk->sk_receive_queue.lock);
		return put_user(amount, (int __user *)arg);

		return sock_get_timestamp(sk, (struct timeval __user *)arg);
		return sock_get_timestampns(sk, (struct timespec __user *)arg);

	case SIOCGIFBRDADDR:
	case SIOCSIFBRDADDR:
	case SIOCGIFNETMASK:
	case SIOCSIFNETMASK:
	case SIOCGIFDSTADDR:
	case SIOCSIFDSTADDR:
		return inet_dgram_ops.ioctl(sock, cmd, arg);

		return -ENOIOCTLCMD;

static unsigned int packet_poll(struct file *file, struct socket *sock,
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	unsigned int mask = datagram_poll(file, sock, wait);

	spin_lock_bh(&sk->sk_receive_queue.lock);
	if (po->rx_ring.pg_vec) {
		if (!packet_previous_frame(po, &po->rx_ring, TP_STATUS_KERNEL))
			mask |= POLLIN | POLLRDNORM;
	spin_unlock_bh(&sk->sk_receive_queue.lock);
	spin_lock_bh(&sk->sk_write_queue.lock);
	if (po->tx_ring.pg_vec) {
		if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
			mask |= POLLOUT | POLLWRNORM;
	spin_unlock_bh(&sk->sk_write_queue.lock);
/* Dirty? Well, I still did not learn a better way to account
 */

static void packet_mm_open(struct vm_area_struct *vma)
	struct file *file = vma->vm_file;
	struct socket *sock = file->private_data;
	struct sock *sk = sock->sk;

		atomic_inc(&pkt_sk(sk)->mapped);

static void packet_mm_close(struct vm_area_struct *vma)
	struct file *file = vma->vm_file;
	struct socket *sock = file->private_data;
	struct sock *sk = sock->sk;

		atomic_dec(&pkt_sk(sk)->mapped);

static const struct vm_operations_struct packet_mmap_ops = {
	.open	= packet_mm_open,
	.close	= packet_mm_close,

static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
	for (i = 0; i < len; i++) {
		if (likely(pg_vec[i].buffer)) {
			if (pg_vec[i].flags & PGV_FROM_VMALLOC)
				vfree(pg_vec[i].buffer);
				free_pages((unsigned long)pg_vec[i].buffer,
			pg_vec[i].buffer = NULL;

static inline char *alloc_one_pg_vec_page(unsigned long order,
					  unsigned char *flags)
	char *buffer = NULL;
	gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
			  __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;

	buffer = (char *) __get_free_pages(gfp_flags, order);

	/*
	 * __get_free_pages failed, fall back to vmalloc
	 */
	*flags |= PGV_FROM_VMALLOC;
	buffer = vmalloc((1 << order) * PAGE_SIZE);

	/*
	 * vmalloc failed, let's dig into swap here
	 */
	gfp_flags &= ~__GFP_NORETRY;
	buffer = (char *)__get_free_pages(gfp_flags, order);

	/*
	 * complete and utter failure
	 */

static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
	unsigned int block_nr = req->tp_block_nr;

	pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL);
	if (unlikely(!pg_vec))

	for (i = 0; i < block_nr; i++) {
		pg_vec[i].buffer = alloc_one_pg_vec_page(order,
		if (unlikely(!pg_vec[i].buffer))
			goto out_free_pgvec;

	free_pg_vec(pg_vec, order, block_nr);
static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
		int closing, int tx_ring)
	struct pgv *pg_vec = NULL;
	struct packet_sock *po = pkt_sk(sk);
	int was_running, order = 0;
	struct packet_ring_buffer *rb;
	struct sk_buff_head *rb_queue;

	rb = tx_ring ? &po->tx_ring : &po->rx_ring;
	rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;

	if (atomic_read(&po->mapped))
	if (atomic_read(&rb->pending))

	if (req->tp_block_nr) {
		/* Sanity tests and some calculations */
		if (unlikely(rb->pg_vec))

		switch (po->tp_version) {
			po->tp_hdrlen = TPACKET_HDRLEN;
			po->tp_hdrlen = TPACKET2_HDRLEN;

		if (unlikely((int)req->tp_block_size <= 0))
		if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
		if (unlikely(req->tp_frame_size < po->tp_hdrlen +
		if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))

		rb->frames_per_block = req->tp_block_size / req->tp_frame_size;
		if (unlikely(rb->frames_per_block <= 0))
		if (unlikely((rb->frames_per_block * req->tp_block_nr) !=

		order = get_order(req->tp_block_size);
		pg_vec = alloc_pg_vec(req, order);
		if (unlikely(!pg_vec))

		if (unlikely(req->tp_frame_nr))

	/* Detach socket from network */
	spin_lock(&po->bind_lock);
	was_running = po->running;
		__dev_remove_pack(&po->prot_hook);
	spin_unlock(&po->bind_lock);

	mutex_lock(&po->pg_vec_lock);
	if (closing || atomic_read(&po->mapped) == 0) {
#define XC(a, b) ({ __typeof__ ((a)) __t; __t = (a); (a) = (b); __t; })
		spin_lock_bh(&rb_queue->lock);
		pg_vec = XC(rb->pg_vec, pg_vec);
		rb->frame_max = (req->tp_frame_nr - 1);
		rb->frame_size = req->tp_frame_size;
		spin_unlock_bh(&rb_queue->lock);

		order = XC(rb->pg_vec_order, order);
		req->tp_block_nr = XC(rb->pg_vec_len, req->tp_block_nr);

		rb->pg_vec_pages = req->tp_block_size / PAGE_SIZE;
		po->prot_hook.func = (po->rx_ring.pg_vec) ?
						tpacket_rcv : packet_rcv;
		skb_queue_purge(rb_queue);
		if (atomic_read(&po->mapped))
			pr_err("packet_mmap: vma is busy: %d\n",
			       atomic_read(&po->mapped));
	mutex_unlock(&po->pg_vec_lock);

	spin_lock(&po->bind_lock);
	if (was_running && !po->running) {
		dev_add_pack(&po->prot_hook);
	spin_unlock(&po->bind_lock);

		free_pg_vec(pg_vec, order, req->tp_block_nr);
static int packet_mmap(struct file *file, struct socket *sock,
		       struct vm_area_struct *vma)
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	unsigned long size, expected_size;
	struct packet_ring_buffer *rb;
	unsigned long start;

	mutex_lock(&po->pg_vec_lock);

	for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
			expected_size += rb->pg_vec_len

	if (expected_size == 0)

	size = vma->vm_end - vma->vm_start;
	if (size != expected_size)

	start = vma->vm_start;
	for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
		if (rb->pg_vec == NULL)

		for (i = 0; i < rb->pg_vec_len; i++) {
			void *kaddr = rb->pg_vec[i].buffer;

			for (pg_num = 0; pg_num < rb->pg_vec_pages;
				if (rb->pg_vec[i].flags & PGV_FROM_VMALLOC)
					page = vmalloc_to_page(kaddr);
					page = virt_to_page(kaddr);

				err = vm_insert_page(vma, start, page);

	atomic_inc(&po->mapped);
	vma->vm_ops = &packet_mmap_ops;

	mutex_unlock(&po->pg_vec_lock);

static const struct proto_ops packet_ops_spkt = {
	.family =	PF_PACKET,
	.owner =	THIS_MODULE,
	.release =	packet_release,
	.bind =		packet_bind_spkt,
	.connect =	sock_no_connect,
	.socketpair =	sock_no_socketpair,
	.accept =	sock_no_accept,
	.getname =	packet_getname_spkt,
	.poll =		datagram_poll,
	.ioctl =	packet_ioctl,
	.listen =	sock_no_listen,
	.shutdown =	sock_no_shutdown,
	.setsockopt =	sock_no_setsockopt,
	.getsockopt =	sock_no_getsockopt,
	.sendmsg =	packet_sendmsg_spkt,
	.recvmsg =	packet_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,

static const struct proto_ops packet_ops = {
	.family =	PF_PACKET,
	.owner =	THIS_MODULE,
	.release =	packet_release,
	.bind =		packet_bind,
	.connect =	sock_no_connect,
	.socketpair =	sock_no_socketpair,
	.accept =	sock_no_accept,
	.getname =	packet_getname,
	.poll =		packet_poll,
	.ioctl =	packet_ioctl,
	.listen =	sock_no_listen,
	.shutdown =	sock_no_shutdown,
	.setsockopt =	packet_setsockopt,
	.getsockopt =	packet_getsockopt,
	.sendmsg =	packet_sendmsg,
	.recvmsg =	packet_recvmsg,
	.mmap =		packet_mmap,
	.sendpage =	sock_no_sendpage,

static const struct net_proto_family packet_family_ops = {
	.family =	PF_PACKET,
	.create =	packet_create,
	.owner =	THIS_MODULE,

static struct notifier_block packet_netdev_notifier = {
	.notifier_call =	packet_notifier,

#ifdef CONFIG_PROC_FS

static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
	struct net *net = seq_file_net(seq);

	return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);

static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
	struct net *net = seq_file_net(seq);
	return seq_hlist_next_rcu(v, &net->packet.sklist, pos);

static void packet_seq_stop(struct seq_file *seq, void *v)

static int packet_seq_show(struct seq_file *seq, void *v)
	if (v == SEQ_START_TOKEN)
		seq_puts(seq, "sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n");
		struct sock *s = sk_entry(v);
		const struct packet_sock *po = pkt_sk(s);

			   "%p %-6d %-4d %04x   %-5d %1d %-6u %-6u %-6lu\n",
			   atomic_read(&s->sk_refcnt),
			   atomic_read(&s->sk_rmem_alloc),

static const struct seq_operations packet_seq_ops = {
	.start	= packet_seq_start,
	.next	= packet_seq_next,
	.stop	= packet_seq_stop,
	.show	= packet_seq_show,

static int packet_seq_open(struct inode *inode, struct file *file)
	return seq_open_net(inode, file, &packet_seq_ops,
			    sizeof(struct seq_net_private));

static const struct file_operations packet_seq_fops = {
	.owner		= THIS_MODULE,
	.open		= packet_seq_open,
	.llseek		= seq_lseek,
	.release	= seq_release_net,

static int __net_init packet_net_init(struct net *net)
	spin_lock_init(&net->packet.sklist_lock);
	INIT_HLIST_HEAD(&net->packet.sklist);

	if (!proc_net_fops_create(net, "packet", 0, &packet_seq_fops))

static void __net_exit packet_net_exit(struct net *net)
	proc_net_remove(net, "packet");

static struct pernet_operations packet_net_ops = {
	.init = packet_net_init,
	.exit = packet_net_exit,

static void __exit packet_exit(void)
	unregister_netdevice_notifier(&packet_netdev_notifier);
	unregister_pernet_subsys(&packet_net_ops);
	sock_unregister(PF_PACKET);
	proto_unregister(&packet_proto);

static int __init packet_init(void)
	int rc = proto_register(&packet_proto, 0);

	sock_register(&packet_family_ops);
	register_pernet_subsys(&packet_net_ops);
	register_netdevice_notifier(&packet_netdev_notifier);

module_init(packet_init);
module_exit(packet_exit);
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_PACKET);