2 * IPVS An implementation of the IP virtual server support for the
3 * LINUX operating system. IPVS is now implemented as a module
4 * over the NetFilter framework. IPVS can be used to build a
5 * high-performance and highly available server based on a
8 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
10 * ip_vs_sync: sync connection info from master load balancer to backups
14 * Alexandre Cassen : Added master & backup support at a time.
15 * Alexandre Cassen : Added SyncID support for incoming sync
17 * Justin Ossevoort : Fix endian problem on sync message size.
/* Log prefix setup: pr_fmt() puts the KMSG_COMPONENT string and a colon in front of every format string. */
20 #define KMSG_COMPONENT "IPVS"
21 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
23 #include <linux/module.h>
24 #include <linux/slab.h>
25 #include <linux/inetdevice.h>
26 #include <linux/net.h>
27 #include <linux/completion.h>
28 #include <linux/delay.h>
29 #include <linux/skbuff.h>
31 #include <linux/igmp.h> /* for ip_mc_join_group */
32 #include <linux/udp.h>
33 #include <linux/err.h>
34 #include <linux/kthread.h>
35 #include <linux/wait.h>
36 #include <linux/kernel.h>
41 #include <net/ip_vs.h>
/* Multicast endpoint for sync traffic and the protocol version carried in the version 1 header. */
43 #define IP_VS_SYNC_GROUP 0xe0000051 /* multicast addr - 224.0.0.81 */
44 #define IP_VS_SYNC_PORT 8848 /* multicast port */
46 #define SYNC_PROTO_VER 1 /* Protocol version in header */
49 * IPVS sync connection entry
50 * Version 0, i.e. original version.
/*
 * One connection as carried in a version 0 sync message: IPv4 addresses only,
 * with 16-bit big-endian flags and state.
 * NOTE(review): this extract is missing some lines of the struct
 * (ip_vs_sync_conn() also stores cport, vport and dport here) as well as the
 * closing brace.
 */
52 struct ip_vs_sync_conn_v0 {
55 /* Protocol, addresses and port numbers */
56 __u8 protocol; /* Which protocol (TCP/UDP) */
60 __be32 caddr; /* client address */
61 __be32 vaddr; /* virtual address */
62 __be32 daddr; /* destination address */
64 /* Flags and state transition */
65 __be16 flags; /* status flags */
66 __be16 state; /* state info */
68 /* The sequence options start here */
/*
 * Optional trailer of a version 0 entry. It is written directly behind the
 * fixed part of the entry, and only for connections whose flags match
 * IP_VS_CONN_F_SEQ_MASK.
 */
71 struct ip_vs_sync_conn_options {
72 struct ip_vs_seq in_seq; /* incoming seq. struct */
73 struct ip_vs_seq out_seq; /* outgoing seq. struct */
77 Sync Connection format (sync_conn)
80 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
81 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
82 | Type | Protocol | Ver. | Size |
83 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
85 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
87 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
89 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
91 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
93 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
95 | IP-Addresses (v4 or v6) |
97 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
99 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
100 | Param. Type | Param. Length | Param. data |
101 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ |
103 | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
104 | | Param Type | Param. Length |
105 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
107 | Last Param data should be padded for 32 bit alignment |
108 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
112 * Type 0, IPv4 sync connection format
/*
 * Version 1 sync entry for an IPv4 connection. All multi-byte members are
 * declared big-endian. ver_size shares one 16-bit word between the version
 * and the size (see SVER_SHIFT and SVER_MASK).
 * NOTE(review): lines are missing from this extract. The format diagram shows
 * a Type field and the comment below announces port numbers; neither is
 * visible here, nor is the closing brace.
 */
114 struct ip_vs_sync_v4 {
116 __u8 protocol; /* Which protocol (TCP/UDP) */
117 __be16 ver_size; /* Version msb 4 bits */
118 /* Flags and state transition */
119 __be32 flags; /* status flags */
120 __be16 state; /* state info */
121 /* Protocol, addresses and port numbers */
125 __be32 fwmark; /* Firewall mark from skb */
126 __be32 timeout; /* cp timeout */
127 __be32 caddr; /* client address */
128 __be32 vaddr; /* virtual address */
129 __be32 daddr; /* destination address */
130 /* The sequence options start here */
131 /* PE data padded to 32bit alignment after seq. options */
134 * Type 2 messages IPv6
/*
 * Version 1 sync entry for an IPv6 connection. Same member order as the IPv4
 * layout, with the three addresses widened to struct in6_addr.
 * NOTE(review): lines are missing from this extract (no Type field, port
 * members or closing brace are visible).
 */
136 struct ip_vs_sync_v6 {
138 __u8 protocol; /* Which protocol (TCP/UDP) */
139 __be16 ver_size; /* Version msb 4 bits */
140 /* Flags and state transition */
141 __be32 flags; /* status flags */
142 __be16 state; /* state info */
143 /* Protocol, addresses and port numbers */
147 __be32 fwmark; /* Firewall mark from skb */
148 __be32 timeout; /* cp timeout */
149 struct in6_addr caddr; /* client address */
150 struct in6_addr vaddr; /* virtual address */
151 struct in6_addr daddr; /* destination address */
152 /* The sequence options start here */
153 /* PE data padded to 32bit alignment after seq. options */
/* Either layout of a version 1 sync entry: IPv4 or IPv6. */
156 union ip_vs_sync_conn {
157 struct ip_vs_sync_v4 v4;
158 struct ip_vs_sync_v6 v6;
161 /* Bits in Type field in above */
/* STYPE_INET6 is a bit number; STYPE_F_INET6 is the matching bit mask. */
162 #define STYPE_INET6 0
163 #define STYPE_F_INET6 (1 << STYPE_INET6)
/* ver_size layout: version above bit 12, size in the low 12 bits. */
165 #define SVER_SHIFT 12 /* Shift to get version */
166 #define SVER_MASK 0x0fff /* Mask to strip version */
/* Option (parameter) type codes. */
168 #define IPVS_OPT_SEQ_DATA 1
169 #define IPVS_OPT_PE_DATA 2
170 #define IPVS_OPT_PE_NAME 3
171 #define IPVS_OPT_PARAM 7
/* One flag bit per option type: bit number is the type code minus one. */
173 #define IPVS_OPT_F_SEQ_DATA (1 << (IPVS_OPT_SEQ_DATA-1))
174 #define IPVS_OPT_F_PE_DATA (1 << (IPVS_OPT_PE_DATA-1))
175 #define IPVS_OPT_F_PE_NAME (1 << (IPVS_OPT_PE_NAME-1))
176 #define IPVS_OPT_F_PARAM (1 << (IPVS_OPT_PARAM-1))
/*
 * Context passed as the data argument of the sync kthreads.
 * NOTE(review): the members are not visible in this extract; the thread
 * functions use tinfo->sock and tinfo->buf.
 */
178 struct ip_vs_sync_thread_data {
183 /* Version 0 definition of packet sizes */
/* SIMPLE is the bare version 0 entry; FULL adds the sequence options trailer. */
184 #define SIMPLE_CONN_SIZE (sizeof(struct ip_vs_sync_conn_v0))
185 #define FULL_CONN_SIZE \
186 (sizeof(struct ip_vs_sync_conn_v0) + sizeof(struct ip_vs_sync_conn_options))
190 The master multicasts messages (Datagrams) to the backup load balancers
191 in the following format.
194 Note, first byte should be Zero, so ver 0 receivers will drop the packet.
197 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
198 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
199 | 0 | SyncID | Size |
200 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
201 | Count Conns | Version | Reserved, set to Zero |
202 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
204 | IPVS Sync Connection (1) |
205 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
209 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
211 | IPVS Sync Connection (n) |
212 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
216 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
217 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
218 | Count Conns | SyncID | Size |
219 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
220 | IPVS Sync Connection (1) |
/* Bytes of message header in front of the connection entries; used when sizing the send buffer. */
223 #define SYNC_MESG_HEADER_LEN 4
224 #define MAX_CONNS_PER_SYNCBUFF 255 /* nr_conns in ip_vs_sync_mesg is 8 bit */
226 /* Version 0 header */
/*
 * NOTE(review): the members are not visible in this extract; code in this
 * file accesses nr_conns, syncid and size.
 */
227 struct ip_vs_sync_mesg {
232 /* ip_vs_sync_conn entries start here */
235 /* Version 1 header */
/*
 * NOTE(review): only some of the members are visible in this extract. The
 * leading reserved byte must be zero; per the format notes in this file that
 * makes version 0 receivers drop the packet.
 */
236 struct ip_vs_sync_mesg_v2 {
237 __u8 reserved; /* must be zero */
241 __s8 version; /* SYNC_PROTO_VER */
243 /* ip_vs_sync_conn entries start here */
246 /* the maximum length of sync (sending/receiving) message */
/* Both are assigned in set_sync_mesg_maxlen() from the MTU of the configured multicast interface. */
247 static int sync_send_mesg_maxlen;
248 static int sync_recv_mesg_maxlen;
/*
 * One outgoing sync message, either still being filled or queued for sending.
 * NOTE(review): the head and end members set by ip_vs_sync_buff_create() are
 * not visible in this extract.
 */
250 struct ip_vs_sync_buff {
/* Link used to put the buffer on ip_vs_sync_queue. */
251 struct list_head list;
/* Value of jiffies when the buffer was created. */
252 unsigned long firstuse;
254 /* pointers for the message data */
255 struct ip_vs_sync_mesg *mesg;
261 /* the sync_buff list head and the lock */
/* Buffers waiting to be sent by the master thread, protected by ip_vs_sync_lock. */
262 static LIST_HEAD(ip_vs_sync_queue);
263 static DEFINE_SPINLOCK(ip_vs_sync_lock);
265 /* current sync_buff for accepting new conn entries */
266 static struct ip_vs_sync_buff *curr_sb = NULL;
267 static DEFINE_SPINLOCK(curr_sb_lock);
269 /* ipvs sync daemon state */
/* ip_vs_sync_state is a bit mask: start_sync_thread() ORs the state in, stop_sync_thread() clears it. */
270 volatile int ip_vs_sync_state = IP_VS_STATE_NONE;
271 volatile int ip_vs_master_syncid = 0;
272 volatile int ip_vs_backup_syncid = 0;
274 /* multicast interface name */
275 char ip_vs_master_mcast_ifn[IP_VS_IFNAME_MAXLEN];
276 char ip_vs_backup_mcast_ifn[IP_VS_IFNAME_MAXLEN];
278 /* sync daemon tasks */
/* A non-NULL pointer means the corresponding thread is running. */
279 static struct task_struct *sync_master_thread;
280 static struct task_struct *sync_backup_thread;
/*
 * Sync group and port in network byte order. The master socket connects to
 * this address; the backup socket binds to it and joins the group.
 */
283 static struct sockaddr_in mcast_addr = {
284 .sin_family = AF_INET,
285 .sin_port = cpu_to_be16(IP_VS_SYNC_PORT),
286 .sin_addr.s_addr = cpu_to_be32(IP_VS_SYNC_GROUP),
/*
 * Take the first buffer off ip_vs_sync_queue while holding ip_vs_sync_lock
 * with bottom halves disabled.
 * NOTE(review): the empty-queue branch, the list removal and the return
 * statement are not visible in this extract. Callers loop until the result is
 * NULL, so an empty queue presumably yields NULL.
 */
290 static inline struct ip_vs_sync_buff *sb_dequeue(void)
292 struct ip_vs_sync_buff *sb;
294 spin_lock_bh(&ip_vs_sync_lock);
295 if (list_empty(&ip_vs_sync_queue)) {
298 sb = list_entry(ip_vs_sync_queue.next,
299 struct ip_vs_sync_buff,
303 spin_unlock_bh(&ip_vs_sync_lock);
/*
 * Allocate an empty sync buffer whose message area is sync_send_mesg_maxlen
 * bytes long. Both allocations use GFP_ATOMIC.
 * NOTE(review): the failure paths and the return statement are not visible in
 * this extract; the caller treats a NULL result as failure.
 */
308 static inline struct ip_vs_sync_buff * ip_vs_sync_buff_create(void)
310 struct ip_vs_sync_buff *sb;
312 if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC)))
315 if (!(sb->mesg=kmalloc(sync_send_mesg_maxlen, GFP_ATOMIC))) {
/* Start with no entries and stamp the message with the master sync id. */
319 sb->mesg->nr_conns = 0;
320 sb->mesg->syncid = ip_vs_master_syncid;
/* Entries start 4 bytes into the message; 4 is also the value of SYNC_MESG_HEADER_LEN. */
322 sb->head = (unsigned char *)sb->mesg + 4;
323 sb->end = (unsigned char *)sb->mesg + sync_send_mesg_maxlen;
/* Creation time, used later to decide when a partly filled buffer is sent. */
324 sb->firstuse = jiffies;
/* Release a sync buffer. NOTE(review): the body is not visible in this extract. */
328 static inline void ip_vs_sync_buff_release(struct ip_vs_sync_buff *sb)
/*
 * Hand a filled buffer to the master thread: append it to ip_vs_sync_queue
 * if the master state bit is set.
 * NOTE(review): the else line is not visible in this extract; the release
 * call below presumably belongs to it, so the buffer is dropped when the
 * master is being stopped.
 * This function takes the lock with plain spin_lock(), while sb_dequeue() and
 * stop_sync_thread() use the _bh variants.
 */
334 static inline void sb_queue_tail(struct ip_vs_sync_buff *sb)
336 spin_lock(&ip_vs_sync_lock);
337 if (ip_vs_sync_state & IP_VS_STATE_MASTER)
338 list_add_tail(&sb->list, &ip_vs_sync_queue);
340 ip_vs_sync_buff_release(sb);
341 spin_unlock(&ip_vs_sync_lock);
345 * Get the current sync buffer if it has been created for more
346 * than the specified time or the specified time is zero.
348 static inline struct ip_vs_sync_buff *
349 get_curr_sync_buff(unsigned long time)
351 struct ip_vs_sync_buff *sb;
353 spin_lock_bh(&curr_sb_lock);
354 if (curr_sb && (time == 0 ||
355 time_before(jiffies - curr_sb->firstuse, time))) {
360 spin_unlock_bh(&curr_sb_lock);
366 * Add an ip_vs_conn information into the current sync_buff.
367 * Called by ip_vs_in.
/*
 * Append one connection to the current sync buffer in version 0 format and
 * queue the buffer for sending once it cannot hold another full-size entry.
 *
 * cp: connection to synchronize; its controlling connection (cp->control) is
 *     passed to a recursive call at the end.
 *
 * NOTE(review): several short lines (braces, returns, the len declaration,
 * message header updates) are missing from this extract.
 */
369 void ip_vs_sync_conn(const struct ip_vs_conn *cp)
371 struct ip_vs_sync_mesg *m;
372 struct ip_vs_sync_conn_v0 *s;
/* curr_sb is only touched with curr_sb_lock held. */
375 spin_lock(&curr_sb_lock);
/* Allocation failure path: drop the lock and log. */
377 if (!(curr_sb=ip_vs_sync_buff_create())) {
378 spin_unlock(&curr_sb_lock);
379 pr_err("ip_vs_sync_buff_create failed.\n");
/* Entries that carry sequence data use the larger FULL_CONN_SIZE. */
384 len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE :
387 s = (struct ip_vs_sync_conn_v0 *)curr_sb->head;
/* Protocol, ports and addresses are copied as stored in cp. */
390 s->protocol = cp->protocol;
391 s->cport = cp->cport;
392 s->vport = cp->vport;
393 s->dport = cp->dport;
394 s->caddr = cp->caddr.ip;
395 s->vaddr = cp->vaddr.ip;
396 s->daddr = cp->daddr.ip;
/* Flags (with the HASHED bit masked out) and state go out in network byte order. */
397 s->flags = htons(cp->flags & ~IP_VS_CONN_F_HASHED);
398 s->state = htons(cp->state);
399 if (cp->flags & IP_VS_CONN_F_SEQ_MASK) {
/* The options trailer sits directly behind the fixed part of the entry. */
400 struct ip_vs_sync_conn_options *opt =
401 (struct ip_vs_sync_conn_options *)&s[1];
/*
 * Copies sizeof(*opt) bytes starting at cp->in_seq.
 * NOTE(review): presumably relies on out_seq directly following in_seq in
 * struct ip_vs_conn - confirm.
 */
402 memcpy(opt, &cp->in_seq, sizeof(*opt));
407 curr_sb->head += len;
409 /* check if there is a space for next one */
410 if (curr_sb->head+FULL_CONN_SIZE > curr_sb->end) {
411 sb_queue_tail(curr_sb);
414 spin_unlock(&curr_sb_lock);
416 /* synchronize its controller if it has */
418 ip_vs_sync_conn(cp->control);
/*
 * Fill the connection lookup parameter p from the fields of a received sync
 * entry. At present this only forwards its arguments to
 * ip_vs_conn_fill_param().
 * NOTE(review): the return type line and the return statement are not visible
 * in this extract; the caller tests the result in an if statement.
 */
422 ip_vs_conn_fill_param_sync(int af, int protocol,
423 const union nf_inet_addr *caddr, __be16 cport,
424 const union nf_inet_addr *vaddr, __be16 vport,
425 struct ip_vs_conn_param *p)
427 /* XXX: Need to take into account persistence engine */
428 ip_vs_conn_fill_param(af, protocol, caddr, cport, vaddr, vport, p);
433 * Process received multicast message and create the corresponding
434 * ip_vs_conn entries.
/*
 * Parse one received version 0 sync datagram and create or update the
 * matching ip_vs_conn entries.
 *
 * buffer: the received datagram; the size member of its header is converted
 *         to host byte order in place.
 * buflen: number of bytes received.
 *
 * NOTE(review): many short lines (braces, returns, else lines, a few
 * assignments and declarations) are missing from this extract, so the
 * comments below only describe what the visible statements do.
 */
436 static void ip_vs_process_message(char *buffer, const size_t buflen)
438 struct ip_vs_sync_mesg *m = (struct ip_vs_sync_mesg *)buffer;
439 struct ip_vs_sync_conn_v0 *s;
440 struct ip_vs_sync_conn_options *opt;
441 struct ip_vs_conn *cp;
442 struct ip_vs_protocol *pp;
443 struct ip_vs_dest *dest;
444 struct ip_vs_conn_param param;
/* The datagram must at least hold the message header. */
448 if (buflen < sizeof(struct ip_vs_sync_mesg)) {
449 IP_VS_ERR_RL("sync message header too short\n");
453 /* Convert size back to host byte order */
454 m->size = ntohs(m->size);
/* The size announced in the header must equal the received length. */
456 if (buflen != m->size) {
457 IP_VS_ERR_RL("bogus sync message size\n");
461 /* SyncID sanity check */
/* A backup sync id of zero accepts messages from any master. */
462 if (ip_vs_backup_syncid != 0 && m->syncid != ip_vs_backup_syncid) {
463 IP_VS_DBG(7, "Ignoring incoming msg with syncid = %d\n",
/* p walks over the entries that follow the header. */
468 p = (char *)buffer + sizeof(struct ip_vs_sync_mesg);
469 for (i=0; i<m->nr_conns; i++) {
470 unsigned flags, state;
/* Bounds check before the fixed part of the entry is read. */
472 if (p + SIMPLE_CONN_SIZE > buffer+buflen) {
473 IP_VS_ERR_RL("bogus conn in sync message\n");
476 s = (struct ip_vs_sync_conn_v0 *) p;
/* Mark the connection as created by sync and never trust a received HASHED bit. */
477 flags = ntohs(s->flags) | IP_VS_CONN_F_SYNC;
478 flags &= ~IP_VS_CONN_F_HASHED;
479 if (flags & IP_VS_CONN_F_SEQ_MASK) {
/* Entry carries the sequence options trailer right behind its fixed part. */
480 opt = (struct ip_vs_sync_conn_options *)&s[1];
482 if (p > buffer+buflen) {
483 IP_VS_ERR_RL("bogus conn options in sync message\n");
488 p += SIMPLE_CONN_SIZE;
491 state = ntohs(s->state);
/* Validate the received state; regular connections are checked against their protocol. */
492 if (!(flags & IP_VS_CONN_F_TEMPLATE)) {
493 pp = ip_vs_proto_get(s->protocol);
495 IP_VS_ERR_RL("Unsupported protocol %u in sync msg\n",
499 if (state >= pp->num_states) {
500 IP_VS_DBG(2, "Invalid %s state %u in sync msg\n",
505 /* protocol in templates is not used for state/timeout */
508 IP_VS_DBG(2, "Invalid template state %u in sync msg\n",
/* Build the lookup key from the client and virtual address of the entry. */
514 if (ip_vs_conn_fill_param_sync(AF_INET, s->protocol,
515 (union nf_inet_addr *)&s->caddr,
517 (union nf_inet_addr *)&s->vaddr,
/* NOTE(review): unlike the other messages in this function, this one has no trailing newline. */
519 pr_err("ip_vs_conn_fill_param_sync failed");
/*
 * Look up an existing connection or template.
 * NOTE(review): the argument in the lookup and creation calls below looks
 * like a mis-encoded address-of expression for the local variable param -
 * confirm against a clean copy of the file.
 */
522 if (!(flags & IP_VS_CONN_F_TEMPLATE))
523 cp = ip_vs_conn_in_get(¶m);
525 cp = ip_vs_ct_in_get(¶m);
528 * Find the appropriate destination for the connection.
529 * If it is not found the connection will remain unbound
532 dest = ip_vs_find_dest(AF_INET,
533 (union nf_inet_addr *)&s->daddr,
535 (union nf_inet_addr *)&s->vaddr,
538 /* Set the appropriate activity flag */
/* For TCP and SCTP, only the ESTABLISHED state counts as active. */
539 if (s->protocol == IPPROTO_TCP) {
540 if (state != IP_VS_TCP_S_ESTABLISHED)
541 flags |= IP_VS_CONN_F_INACTIVE;
543 flags &= ~IP_VS_CONN_F_INACTIVE;
544 } else if (s->protocol == IPPROTO_SCTP) {
545 if (state != IP_VS_SCTP_S_ESTABLISHED)
546 flags |= IP_VS_CONN_F_INACTIVE;
548 flags &= ~IP_VS_CONN_F_INACTIVE;
550 cp = ip_vs_conn_new(¶m,
551 (union nf_inet_addr *)&s->daddr,
552 s->dport, flags, dest, 0);
/* Drop the destination reference obtained by the lookup above. */
554 atomic_dec(&dest->refcnt);
556 pr_err("ip_vs_conn_new failed\n");
/* Existing connection without a destination: try to bind one now. */
559 } else if (!cp->dest) {
560 dest = ip_vs_try_bind_dest(cp);
562 atomic_dec(&dest->refcnt);
563 } else if ((cp->dest) && (cp->protocol == IPPROTO_TCP) &&
564 (cp->state != state)) {
565 /* update active/inactive flag for the connection */
/* Move the connection between the active and inactive counters of its destination. */
567 if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
568 (state != IP_VS_TCP_S_ESTABLISHED)) {
569 atomic_dec(&dest->activeconns);
570 atomic_inc(&dest->inactconns);
571 cp->flags |= IP_VS_CONN_F_INACTIVE;
572 } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) &&
573 (state == IP_VS_TCP_S_ESTABLISHED)) {
574 atomic_inc(&dest->activeconns);
575 atomic_dec(&dest->inactconns);
576 cp->flags &= ~IP_VS_CONN_F_INACTIVE;
578 } else if ((cp->dest) && (cp->protocol == IPPROTO_SCTP) &&
579 (cp->state != state)) {
581 if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
582 (state != IP_VS_SCTP_S_ESTABLISHED)) {
583 atomic_dec(&dest->activeconns);
584 atomic_inc(&dest->inactconns);
/*
 * NOTE(review): the connection was just moved to the inactive counter, yet
 * the INACTIVE flag is cleared here. The equivalent TCP branch above sets the
 * flag with |=, so this looks like it should set it as well - confirm.
 */
585 cp->flags &= ~IP_VS_CONN_F_INACTIVE;
/* Take over the received sequence data and restart the packet counter at the sync threshold. */
590 memcpy(&cp->in_seq, opt, sizeof(*opt));
591 atomic_set(&cp->in_pkts, sysctl_ip_vs_sync_threshold[0]);
593 cp->old_state = cp->state;
595 * We can not recover the right timeout for templates
596 * in all cases, we can not find the right fwmark
597 * virtual service. If needed, we can do it for
598 * non-fwmark persistent services.
/* Regular connections take the protocol timeout for the state; the fixed value below is 3 minutes. */
600 if (!(flags & IP_VS_CONN_F_TEMPLATE) && pp->timeout_table)
601 cp->timeout = pp->timeout_table[state];
603 cp->timeout = (3*60*HZ);
610 * Setup loopback of outgoing multicasts on a sending socket
/*
 * Switch multicast loopback on (loop != 0) or off by writing the mc_loop
 * member of the inet socket directly, instead of going through setsockopt.
 */
612 static void set_mcast_loop(struct sock *sk, u_char loop)
614 struct inet_sock *inet = inet_sk(sk);
616 /* setsockopt(sock, SOL_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop)); */
618 inet->mc_loop = loop ? 1 : 0;
623 * Specify TTL for outgoing multicasts on a sending socket
/*
 * Set the TTL for outgoing multicasts on sk.
 * NOTE(review): the statement that stores ttl is not visible in this extract.
 */
625 static void set_mcast_ttl(struct sock *sk, u_char ttl)
627 struct inet_sock *inet = inet_sk(sk);
629 /* setsockopt(sock, SOL_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)); */
636 * Specify default interface for outgoing multicasts
/*
 * Make the device named ifname (looked up in init_net) the outgoing multicast
 * interface of sk by storing its ifindex in mc_index.
 * NOTE(review): the return statements are not visible in this extract; the
 * two checks below presumably fail the call when the device does not exist or
 * when the socket is bound to a different device.
 */
638 static int set_mcast_if(struct sock *sk, char *ifname)
640 struct net_device *dev;
641 struct inet_sock *inet = inet_sk(sk);
643 if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL)
646 if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
650 inet->mc_index = dev->ifindex;
651 /* inet->mc_addr = 0; */
659 * Set the maximum length of sync message according to the
660 * specified interface's MTU.
/*
 * Compute sync_send_mesg_maxlen (master) or sync_recv_mesg_maxlen (backup)
 * from the MTU of the configured multicast interface.
 *
 * sync_state: IP_VS_STATE_MASTER or IP_VS_STATE_BACKUP.
 *
 * NOTE(review): the return statements and the num declaration are not visible
 * in this extract.
 */
662 static int set_sync_mesg_maxlen(int sync_state)
664 struct net_device *dev;
667 if (sync_state == IP_VS_STATE_MASTER) {
668 if ((dev = __dev_get_by_name(&init_net, ip_vs_master_mcast_ifn)) == NULL)
/*
 * Number of simple entries that fit into one datagram after the IP, UDP and
 * sync headers. NOTE(review): the extra 20 bytes subtracted here are not
 * explained in the visible source.
 */
671 num = (dev->mtu - sizeof(struct iphdr) -
672 sizeof(struct udphdr) -
673 SYNC_MESG_HEADER_LEN - 20) / SIMPLE_CONN_SIZE;
/* The entry count is capped because the header stores it in 8 bits. */
674 sync_send_mesg_maxlen = SYNC_MESG_HEADER_LEN +
675 SIMPLE_CONN_SIZE * min(num, MAX_CONNS_PER_SYNCBUFF);
676 IP_VS_DBG(7, "setting the maximum length of sync sending "
677 "message %d.\n", sync_send_mesg_maxlen);
678 } else if (sync_state == IP_VS_STATE_BACKUP) {
679 if ((dev = __dev_get_by_name(&init_net, ip_vs_backup_mcast_ifn)) == NULL)
/* The receive buffer covers the whole UDP payload of one MTU-sized packet. */
682 sync_recv_mesg_maxlen = dev->mtu -
683 sizeof(struct iphdr) - sizeof(struct udphdr);
684 IP_VS_DBG(7, "setting the maximum length of sync receiving "
685 "message %d.\n", sync_recv_mesg_maxlen);
693 * Join a multicast group.
694 * the group is specified by a class D multicast address 224.0.0.0/4
695 * in the in_addr structure passed in as a parameter.
/*
 * Join the multicast group addr on the device named ifname.
 * NOTE(review): the return type line, the ret declaration and the return
 * statements are not visible in this extract.
 */
698 join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname)
700 struct ip_mreqn mreq;
701 struct net_device *dev;
/* Request = group address plus interface index; all other members stay zero. */
704 memset(&mreq, 0, sizeof(mreq));
705 memcpy(&mreq.imr_multiaddr, addr, sizeof(struct in_addr));
707 if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL)
709 if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
712 mreq.imr_ifindex = dev->ifindex;
715 ret = ip_mc_join_group(sk, &mreq);
/*
 * Bind sock to an address selected from the device named ifname.
 * Returns the result of the bind operation of the socket.
 * NOTE(review): the addr declaration, the error returns and the sin_port
 * assignment are not visible in this extract.
 */
722 static int bind_mcastif_addr(struct socket *sock, char *ifname)
724 struct net_device *dev;
726 struct sockaddr_in sin;
728 if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL)
/* Ask the stack for an address on that device. */
731 addr = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
733 pr_err("You probably need to specify IP address on "
734 "multicast interface.\n");
736 IP_VS_DBG(7, "binding socket with (%s) %pI4\n",
739 /* Now bind the socket with the address of multicast interface */
740 sin.sin_family = AF_INET;
741 sin.sin_addr.s_addr = addr;
744 return sock->ops->bind(sock, (struct sockaddr*)&sin, sizeof(sin));
748 * Set up sending multicast socket over UDP
/*
 * Create the UDP socket used by the master: select the outgoing multicast
 * interface, disable loopback, set TTL 1, bind to an address of that
 * interface and connect to the sync group.
 * Returns the socket, or an ERR_PTR value on failure.
 * NOTE(review): the declarations, the result checks and the error label are
 * not visible in this extract.
 */
750 static struct socket * make_send_sock(void)
755 /* First create a socket */
756 result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock);
758 pr_err("Error during creation of socket; terminating\n");
759 return ERR_PTR(result);
762 result = set_mcast_if(sock->sk, ip_vs_master_mcast_ifn);
764 pr_err("Error setting outbound mcast interface\n");
/* No loopback of our own messages; TTL 1 is passed for outgoing multicasts. */
768 set_mcast_loop(sock->sk, 0);
769 set_mcast_ttl(sock->sk, 1);
771 result = bind_mcastif_addr(sock, ip_vs_master_mcast_ifn);
773 pr_err("Error binding address of the mcast interface\n");
/* Connecting fixes the destination, so later sends need no address. */
777 result = sock->ops->connect(sock, (struct sockaddr *) &mcast_addr,
778 sizeof(struct sockaddr), 0);
780 pr_err("Error connecting to the multicast addr\n");
788 return ERR_PTR(result);
793 * Set up receiving multicast socket over UDP
/*
 * Create the UDP socket used by the backup: enable address reuse, bind to the
 * sync group address and port, and join the group on the backup interface.
 * Returns the socket, or an ERR_PTR value on failure.
 * NOTE(review): the declarations, the result checks and the error label are
 * not visible in this extract.
 */
795 static struct socket * make_receive_sock(void)
800 /* First create a socket */
801 result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock);
803 pr_err("Error during creation of socket; terminating\n");
804 return ERR_PTR(result);
807 /* it is equivalent to the REUSEADDR option in user-space */
808 sock->sk->sk_reuse = 1;
810 result = sock->ops->bind(sock, (struct sockaddr *) &mcast_addr,
811 sizeof(struct sockaddr));
813 pr_err("Error binding to the multicast addr\n");
817 /* join the multicast group */
818 result = join_mcast_group(sock->sk,
819 (struct in_addr *) &mcast_addr.sin_addr,
820 ip_vs_backup_mcast_ifn);
822 pr_err("Error joining to the multicast group\n");
830 return ERR_PTR(result);
/*
 * Send length bytes from buffer over sock without blocking
 * (MSG_DONTWAIT) and without raising a signal (MSG_NOSIGNAL).
 * NOTE(review): the return type line, the iov and len declarations and the
 * return statement are not visible in this extract; the caller compares the
 * result with the message size.
 */
835 ip_vs_send_async(struct socket *sock, const char *buffer, const size_t length)
837 struct msghdr msg = {.msg_flags = MSG_DONTWAIT|MSG_NOSIGNAL};
842 iov.iov_base = (void *)buffer;
843 iov.iov_len = length;
845 len = kernel_sendmsg(sock, &msg, &iov, 1, (size_t)(length));
/*
 * Send one sync message. The size member of msg is converted to network byte
 * order in place, so msg must not be interpreted in host order afterwards.
 * A short or failed send is only logged.
 * NOTE(review): the return type line and the msize definition are not visible
 * in this extract.
 */
852 ip_vs_send_sync_msg(struct socket *sock, struct ip_vs_sync_mesg *msg)
858 /* Put size in network byte order */
859 msg->size = htons(msg->size);
861 if (ip_vs_send_async(sock, (char *)msg, msize) != msize)
862 pr_err("ip_vs_send_async error\n");
/*
 * Receive one datagram of at most buflen bytes from sock into buffer.
 * NOTE(review): the return type line, the iov and len declarations and the
 * return statement are not visible in this extract; the caller passes the
 * result on as the message length.
 */
866 ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen)
868 struct msghdr msg = {NULL,};
874 /* Receive a packet */
875 iov.iov_base = buffer;
876 iov.iov_len = (size_t)buflen;
878 len = kernel_recvmsg(sock, &msg, &iov, 1, buflen, 0);
/*
 * Master kthread: roughly once per second, send every queued sync buffer and
 * then ask get_curr_sync_buff() for the current buffer with a 2 second limit.
 * On stop, free all remaining buffers and release the socket.
 *
 * data: struct ip_vs_sync_thread_data carrying the sending socket.
 *
 * NOTE(review): several short lines (braces, the check of sb before the
 * second send, freeing of tinfo, the return) are missing from this extract.
 */
888 static int sync_thread_master(void *data)
890 struct ip_vs_sync_thread_data *tinfo = data;
891 struct ip_vs_sync_buff *sb;
893 pr_info("sync thread started: state = MASTER, mcast_ifn = %s, "
895 ip_vs_master_mcast_ifn, ip_vs_master_syncid);
897 while (!kthread_should_stop()) {
/* Drain the queue of completely filled buffers. */
898 while ((sb = sb_dequeue())) {
899 ip_vs_send_sync_msg(tinfo->sock, sb->mesg);
900 ip_vs_sync_buff_release(sb);
903 /* check if entries stay in curr_sb for 2 seconds */
904 sb = get_curr_sync_buff(2 * HZ);
906 ip_vs_send_sync_msg(tinfo->sock, sb->mesg);
907 ip_vs_sync_buff_release(sb);
/* Sleep for about one second; a wake-up ends the sleep early. */
910 schedule_timeout_interruptible(HZ);
913 /* clean up the sync_buff queue */
914 while ((sb=sb_dequeue())) {
915 ip_vs_sync_buff_release(sb);
918 /* clean up the current sync_buff */
919 if ((sb = get_curr_sync_buff(0))) {
920 ip_vs_sync_buff_release(sb);
923 /* release the sending multicast socket */
924 sock_release(tinfo->sock);
/*
 * Backup kthread: sleep until the receive queue of the socket is non-empty or
 * the thread is asked to stop, then receive and process every pending
 * datagram. On stop, release the socket.
 *
 * data: struct ip_vs_sync_thread_data carrying the receiving socket and the
 *       receive buffer.
 *
 * NOTE(review): several short lines (the len declaration, the error check
 * after receive, the bottom half disable and enable calls announced by the
 * comment, freeing of the buffers, the return) are missing from this extract.
 */
931 static int sync_thread_backup(void *data)
933 struct ip_vs_sync_thread_data *tinfo = data;
936 pr_info("sync thread started: state = BACKUP, mcast_ifn = %s, "
938 ip_vs_backup_mcast_ifn, ip_vs_backup_syncid);
940 while (!kthread_should_stop()) {
941 wait_event_interruptible(*sk_sleep(tinfo->sock->sk),
942 !skb_queue_empty(&tinfo->sock->sk->sk_receive_queue)
943 || kthread_should_stop());
945 /* do we have data now? */
946 while (!skb_queue_empty(&(tinfo->sock->sk->sk_receive_queue))) {
947 len = ip_vs_receive(tinfo->sock, tinfo->buf,
948 sync_recv_mesg_maxlen);
950 pr_err("receiving message error\n");
954 /* disable bottom half, because it accesses the data
955 shared by softirq while getting/creating conns */
957 ip_vs_process_message(tinfo->buf, len);
962 /* release the receiving multicast socket */
963 sock_release(tinfo->sock);
/*
 * Start the master or backup sync thread.
 *
 * state:     IP_VS_STATE_MASTER or IP_VS_STATE_BACKUP.
 * mcast_ifn: name of the multicast interface; copied into the matching global.
 * syncid:    sync id stored in the matching global.
 *
 * result starts out as -ENOMEM and is replaced by PTR_ERR() of the socket or
 * task on those failures.
 *
 * NOTE(review): many short lines are missing from this extract (the sock
 * declaration, the returns for an already running thread, the IS_ERR checks,
 * filling of tinfo, the error labels and the final return).
 */
971 int start_sync_thread(int state, char *mcast_ifn, __u8 syncid)
973 struct ip_vs_sync_thread_data *tinfo;
974 struct task_struct **realtask, *task;
976 char *name, *buf = NULL;
977 int (*threadfn)(void *data);
978 int result = -ENOMEM;
980 IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current));
981 IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %Zd bytes\n",
982 sizeof(struct ip_vs_sync_conn_v0));
/* Select globals, thread function and socket type for the requested role. */
984 if (state == IP_VS_STATE_MASTER) {
985 if (sync_master_thread)
988 strlcpy(ip_vs_master_mcast_ifn, mcast_ifn,
989 sizeof(ip_vs_master_mcast_ifn));
990 ip_vs_master_syncid = syncid;
991 realtask = &sync_master_thread;
992 name = "ipvs_syncmaster";
993 threadfn = sync_thread_master;
994 sock = make_send_sock();
995 } else if (state == IP_VS_STATE_BACKUP) {
996 if (sync_backup_thread)
999 strlcpy(ip_vs_backup_mcast_ifn, mcast_ifn,
1000 sizeof(ip_vs_backup_mcast_ifn));
1001 ip_vs_backup_syncid = syncid;
1002 realtask = &sync_backup_thread;
1003 name = "ipvs_syncbackup";
1004 threadfn = sync_thread_backup;
1005 sock = make_receive_sock();
1011 result = PTR_ERR(sock);
/* NOTE(review): the int result of set_sync_mesg_maxlen() is not checked here. */
1015 set_sync_mesg_maxlen(state);
/* Only the backup needs a receive buffer. */
1016 if (state == IP_VS_STATE_BACKUP) {
1017 buf = kmalloc(sync_recv_mesg_maxlen, GFP_KERNEL);
1022 tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL);
1029 task = kthread_run(threadfn, tinfo, name);
1031 result = PTR_ERR(task);
1035 /* mark as active */
1037 ip_vs_sync_state |= state;
1039 /* increase the module use count */
1040 ip_vs_use_count_inc();
/*
 * Stop the master or backup sync thread selected by state: clear the state
 * bit, stop the kthread and reset the task pointer, then decrease the module
 * use count. For the master the state bit is cleared under ip_vs_sync_lock,
 * the lock that sb_queue_tail() holds while it tests the bit.
 * NOTE(review): the returns for a thread that is not running, the handling of
 * other state values and the final return are not visible in this extract.
 */
1055 int stop_sync_thread(int state)
1057 IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current));
1059 if (state == IP_VS_STATE_MASTER) {
1060 if (!sync_master_thread)
1063 pr_info("stopping master sync thread %d ...\n",
1064 task_pid_nr(sync_master_thread));
1067 * The lock synchronizes with sb_queue_tail(), so that we don't
1068 * add sync buffers to the queue, when we are already in
1069 * progress of stopping the master sync daemon.
1072 spin_lock_bh(&ip_vs_sync_lock);
1073 ip_vs_sync_state &= ~IP_VS_STATE_MASTER;
1074 spin_unlock_bh(&ip_vs_sync_lock);
1075 kthread_stop(sync_master_thread);
1076 sync_master_thread = NULL;
1077 } else if (state == IP_VS_STATE_BACKUP) {
1078 if (!sync_backup_thread)
1081 pr_info("stopping backup sync thread %d ...\n",
1082 task_pid_nr(sync_backup_thread));
1084 ip_vs_sync_state &= ~IP_VS_STATE_BACKUP;
1085 kthread_stop(sync_backup_thread);
1086 sync_backup_thread = NULL;
1091 /* decrease the module use count */
1092 ip_vs_use_count_dec();