Merge tag 'dmaengine-3.17' of git://git.kernel.org/pub/scm/linux/kernel/git/djbw...
[karo-tx-linux.git] / net/ipv4/tcp_input.c
index b5c23756965ae338d1dfed57ca44be700fd2f148..0185eea59342a9318ba42a9eb2f77da900b53e91 100644 (file)
@@ -73,7 +73,7 @@
 #include <net/inet_common.h>
 #include <linux/ipsec.h>
 #include <asm/unaligned.h>
-#include <net/netdma.h>
+#include <linux/errqueue.h>
 
 int sysctl_tcp_timestamps __read_mostly = 1;
 int sysctl_tcp_window_scaling __read_mostly = 1;
@@ -1106,7 +1106,7 @@ static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb,
        }
 
        /* D-SACK for already forgotten data... Do dumb counting. */
-       if (dup_sack && tp->undo_marker && tp->undo_retrans &&
+       if (dup_sack && tp->undo_marker && tp->undo_retrans > 0 &&
            !after(end_seq_0, prior_snd_una) &&
            after(end_seq_0, tp->undo_marker))
                tp->undo_retrans--;
@@ -1187,7 +1187,7 @@ static u8 tcp_sacktag_one(struct sock *sk,
 
        /* Account D-SACK for retransmitted packet. */
        if (dup_sack && (sacked & TCPCB_RETRANS)) {
-               if (tp->undo_marker && tp->undo_retrans &&
+               if (tp->undo_marker && tp->undo_retrans > 0 &&
                    after(end_seq, tp->undo_marker))
                        tp->undo_retrans--;
                if (sacked & TCPCB_SACKED_ACKED)
@@ -1893,7 +1893,7 @@ static void tcp_clear_retrans_partial(struct tcp_sock *tp)
        tp->lost_out = 0;
 
        tp->undo_marker = 0;
-       tp->undo_retrans = 0;
+       tp->undo_retrans = -1;
 }
 
 void tcp_clear_retrans(struct tcp_sock *tp)
@@ -1904,16 +1904,17 @@ void tcp_clear_retrans(struct tcp_sock *tp)
        tp->sacked_out = 0;
 }
 
-/* Enter Loss state. If "how" is not zero, forget all SACK information
+/* Enter Loss state. If we detect SACK reneging, forget all SACK information
  * and reset tags completely, otherwise preserve SACKs. If receiver
  * dropped its ofo queue, we will know this due to reneging detection.
  */
-void tcp_enter_loss(struct sock *sk, int how)
+void tcp_enter_loss(struct sock *sk)
 {
        const struct inet_connection_sock *icsk = inet_csk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        struct sk_buff *skb;
        bool new_recovery = false;
+       bool is_reneg;                  /* is receiver reneging on SACKs? */
 
        /* Reduce ssthresh if it has not yet been made inside this window. */
        if (icsk->icsk_ca_state <= TCP_CA_Disorder ||
@@ -1934,7 +1935,11 @@ void tcp_enter_loss(struct sock *sk, int how)
                tcp_reset_reno_sack(tp);
 
        tp->undo_marker = tp->snd_una;
-       if (how) {
+
+       skb = tcp_write_queue_head(sk);
+       is_reneg = skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED);
+       if (is_reneg) {
+               NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSACKRENEGING);
                tp->sacked_out = 0;
                tp->fackets_out = 0;
        }
@@ -1948,7 +1953,7 @@ void tcp_enter_loss(struct sock *sk, int how)
                        tp->undo_marker = 0;
 
                TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED;
-               if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || how) {
+               if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || is_reneg) {
                        TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
                        TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
                        tp->lost_out += tcp_skb_pcount(skb);
@@ -1981,19 +1986,21 @@ void tcp_enter_loss(struct sock *sk, int how)
  * remembered SACKs do not reflect real state of receiver i.e.
  * receiver _host_ is heavily congested (or buggy).
  *
- * Do processing similar to RTO timeout.
+ * To avoid big spurious retransmission bursts due to transient SACK
+ * scoreboard oddities that look like reneging, we give the receiver a
+ * little time (max(RTT/2, 10ms)) to send us some more ACKs that will
+ * restore sanity to the SACK scoreboard. If the apparent reneging
+ * persists until this RTO then we'll clear the SACK scoreboard.
  */
 static bool tcp_check_sack_reneging(struct sock *sk, int flag)
 {
        if (flag & FLAG_SACK_RENEGING) {
-               struct inet_connection_sock *icsk = inet_csk(sk);
-               NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSACKRENEGING);
+               struct tcp_sock *tp = tcp_sk(sk);
+               unsigned long delay = max(usecs_to_jiffies(tp->srtt_us >> 4),
+                                         msecs_to_jiffies(10));
 
-               tcp_enter_loss(sk, 1);
-               icsk->icsk_retransmits++;
-               tcp_retransmit_skb(sk, tcp_write_queue_head(sk));
                inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
-                                         icsk->icsk_rto, TCP_RTO_MAX);
+                                         delay, TCP_RTO_MAX);
                return true;
        }
        return false;
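
A note on the delay computed above: tp->srtt_us holds eight times the smoothed RTT in microseconds, so srtt_us >> 4 is RTT/2, and the retransmit timer is armed for max(RTT/2, 10 ms) before the SACK scoreboard is actually cleared. A minimal user-space sketch of the same arithmetic (illustration only; the jiffies conversions are replaced by plain microseconds and the helper name is invented):

/* Illustration: how the reneging grace period is derived.
 * Assumes srtt_us stores 8 * smoothed RTT in microseconds,
 * as struct tcp_sock does in this kernel series.
 */
#include <stdio.h>

static unsigned long reneging_delay_us(unsigned long srtt_us)
{
        unsigned long half_rtt_us = srtt_us >> 4;       /* (8*RTT)/16 == RTT/2 */
        unsigned long floor_us = 10 * 1000;             /* 10 ms lower bound */

        return half_rtt_us > floor_us ? half_rtt_us : floor_us;
}

int main(void)
{
        /* A 40 ms smoothed RTT is stored as 8 * 40000 us. */
        printf("%lu us\n", reneging_delay_us(8 * 40000));       /* -> 20000 us */
        printf("%lu us\n", reneging_delay_us(8 * 4000));        /* -> 10000 us */
        return 0;
}
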
@@ -2475,7 +2482,7 @@ static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo)
  *     losses and/or application stalls), do not perform any further cwnd
  *     reductions, but instead slow start up to ssthresh.
  */
-static void tcp_init_cwnd_reduction(struct sock *sk, const bool set_ssthresh)
+static void tcp_init_cwnd_reduction(struct sock *sk)
 {
        struct tcp_sock *tp = tcp_sk(sk);
 
@@ -2485,8 +2492,7 @@ static void tcp_init_cwnd_reduction(struct sock *sk, const bool set_ssthresh)
        tp->prior_cwnd = tp->snd_cwnd;
        tp->prr_delivered = 0;
        tp->prr_out = 0;
-       if (set_ssthresh)
-               tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk);
+       tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk);
        TCP_ECN_queue_cwr(tp);
 }
 
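
For context, the fields reset in tcp_init_cwnd_reduction() (prr_delivered, prr_out, prior_cwnd) drive the RFC 6937 Proportional Rate Reduction machinery: during recovery, segments are released in proportion to what the receiver reports delivered, scaled by ssthresh/prior_cwnd, so cwnd glides down to ssthresh instead of collapsing. A standalone sketch of that proportionality (illustration only, not a copy of the kernel's tcp_cwnd_reduction()):

/* Sketch of RFC 6937 PRR: how many segments may be sent after
 * prr_delivered segments have been newly delivered.  Field names
 * mirror the tcp_sock members above; this is not kernel code.
 */
#include <stdio.h>

static unsigned int prr_sndcnt(unsigned int prior_cwnd, unsigned int ssthresh,
                               unsigned int prr_delivered, unsigned int prr_out)
{
        /* ceil(prr_delivered * ssthresh / prior_cwnd) - prr_out */
        unsigned int allowed = (prr_delivered * ssthresh + prior_cwnd - 1) /
                               prior_cwnd;

        return allowed > prr_out ? allowed - prr_out : 0;
}

int main(void)
{
        /* prior_cwnd = 10, ssthresh = 7: after 4 newly delivered
         * segments, ceil(4 * 7 / 10) = 3 segments may go out.
         */
        printf("%u\n", prr_sndcnt(10, 7, 4, 0));        /* -> 3 */
        return 0;
}
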
@@ -2528,14 +2534,14 @@ static inline void tcp_end_cwnd_reduction(struct sock *sk)
 }
 
 /* Enter CWR state. Disable cwnd undo since congestion is proven with ECN */
-void tcp_enter_cwr(struct sock *sk, const int set_ssthresh)
+void tcp_enter_cwr(struct sock *sk)
 {
        struct tcp_sock *tp = tcp_sk(sk);
 
        tp->prior_ssthresh = 0;
        if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
                tp->undo_marker = 0;
-               tcp_init_cwnd_reduction(sk, set_ssthresh);
+               tcp_init_cwnd_reduction(sk);
                tcp_set_ca_state(sk, TCP_CA_CWR);
        }
 }
@@ -2564,7 +2570,7 @@ static void tcp_try_to_open(struct sock *sk, int flag, const int prior_unsacked)
                tp->retrans_stamp = 0;
 
        if (flag & FLAG_ECE)
-               tcp_enter_cwr(sk, 1);
+               tcp_enter_cwr(sk);
 
        if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) {
                tcp_try_keep_open(sk);
@@ -2665,12 +2671,12 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
 
        tp->prior_ssthresh = 0;
        tp->undo_marker = tp->snd_una;
-       tp->undo_retrans = tp->retrans_out;
+       tp->undo_retrans = tp->retrans_out ? : -1;
 
        if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
                if (!ece_ack)
                        tp->prior_ssthresh = tcp_current_ssthresh(sk);
-               tcp_init_cwnd_reduction(sk, true);
+               tcp_init_cwnd_reduction(sk);
        }
        tcp_set_ca_state(sk, TCP_CA_Recovery);
 }
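
The "tp->retrans_out ? : -1" above is the GNU C conditional with an omitted middle operand: it evaluates to retrans_out when that is nonzero and to -1 otherwise. Seeding undo_retrans with -1 rather than 0 when nothing is in flight is what the earlier hunks rely on when they test "undo_retrans > 0" instead of plain truthiness. A tiny illustration (assumes GCC or Clang, which implement this extension):

/* GNU C "a ? : b" is equivalent to "a ? a : b", with a evaluated once. */
#include <stdio.h>

int main(void)
{
        int retrans_out = 0;
        int undo_retrans = retrans_out ? : -1;  /* 0 is false -> -1 */

        printf("%d\n", undo_retrans);           /* -> -1 */

        retrans_out = 5;
        undo_retrans = retrans_out ? : -1;      /* nonzero -> itself */
        printf("%d\n", undo_retrans);           /* -> 5 */
        return 0;
}
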
@@ -2680,7 +2686,6 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
  */
 static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack)
 {
-       struct inet_connection_sock *icsk = inet_csk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        bool recovered = !before(tp->snd_una, tp->high_seq);
 
@@ -2706,12 +2711,9 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack)
 
        if (recovered) {
                /* F-RTO RFC5682 sec 3.1 step 2.a and 1st part of step 3.a */
-               icsk->icsk_retransmits = 0;
                tcp_try_undo_recovery(sk);
                return;
        }
-       if (flag & FLAG_DATA_ACKED)
-               icsk->icsk_retransmits = 0;
        if (tcp_is_reno(tp)) {
                /* A Reno DUPACK means new data in F-RTO step 2.b above are
                 * delivered. Lower inflight to clock out (re)transmissions.
@@ -3043,10 +3045,15 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
        first_ackt.v64 = 0;
 
        while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) {
+               struct skb_shared_info *shinfo = skb_shinfo(skb);
                struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
                u8 sacked = scb->sacked;
                u32 acked_pcount;
 
+               if (unlikely(shinfo->tx_flags & SKBTX_ACK_TSTAMP) &&
+                   between(shinfo->tskey, prior_snd_una, tp->snd_una - 1))
+                       __skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK);
+
                /* Determine how many packets and what bytes were acked, tso and else */
                if (after(scb->end_seq, tp->snd_una)) {
                        if (tcp_skb_pcount(skb) == 1 ||
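
The new block in tcp_clean_rtx_queue() is the transmit side of SO_TIMESTAMPING ACK reporting (and the reason <linux/errqueue.h> is now included at the top of the file): once the cumulative ACK covers the byte recorded in shinfo->tskey, an SCM_TSTAMP_ACK timestamp is queued on the socket's error queue. A hedged user-space sketch of requesting such timestamps (assumes uapi headers from this kernel series, which define SOF_TIMESTAMPING_TX_ACK; error handling trimmed):

/* Sketch: ask for software TX ACK timestamps on a connected TCP socket.
 * The timestamps are read back later via recvmsg(fd, ..., MSG_ERRQUEUE)
 * as SCM_TIMESTAMPING control messages.
 */
#include <sys/socket.h>
#include <linux/net_tstamp.h>

int enable_tx_ack_timestamps(int fd)
{
        int flags = SOF_TIMESTAMPING_TX_ACK |   /* stamp when the peer ACKs */
                    SOF_TIMESTAMPING_SOFTWARE | /* report software stamps   */
                    SOF_TIMESTAMPING_OPT_ID;    /* tag stamps with a key    */

        return setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING,
                          &flags, sizeof(flags));
}
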
@@ -3346,7 +3353,7 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)
                tp->tlp_high_seq = 0;
                /* Don't reduce cwnd if DSACK arrives for TLP retrans. */
                if (!(flag & FLAG_DSACKING_ACK)) {
-                       tcp_init_cwnd_reduction(sk, true);
+                       tcp_init_cwnd_reduction(sk);
                        tcp_set_ca_state(sk, TCP_CA_CWR);
                        tcp_end_cwnd_reduction(sk);
                        tcp_try_keep_open(sk);
@@ -3393,8 +3400,10 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
            icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
                tcp_rearm_rto(sk);
 
-       if (after(ack, prior_snd_una))
+       if (after(ack, prior_snd_una)) {
                flag |= FLAG_SND_UNA_ADVANCED;
+               icsk->icsk_retransmits = 0;
+       }
 
        prior_fackets = tp->fackets_out;
 
@@ -4941,53 +4950,6 @@ static inline bool tcp_checksum_complete_user(struct sock *sk,
               __tcp_checksum_complete_user(sk, skb);
 }
 
-#ifdef CONFIG_NET_DMA
-static bool tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb,
-                                 int hlen)
-{
-       struct tcp_sock *tp = tcp_sk(sk);
-       int chunk = skb->len - hlen;
-       int dma_cookie;
-       bool copied_early = false;
-
-       if (tp->ucopy.wakeup)
-               return false;
-
-       if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
-               tp->ucopy.dma_chan = net_dma_find_channel();
-
-       if (tp->ucopy.dma_chan && skb_csum_unnecessary(skb)) {
-
-               dma_cookie = dma_skb_copy_datagram_iovec(tp->ucopy.dma_chan,
-                                                        skb, hlen,
-                                                        tp->ucopy.iov, chunk,
-                                                        tp->ucopy.pinned_list);
-
-               if (dma_cookie < 0)
-                       goto out;
-
-               tp->ucopy.dma_cookie = dma_cookie;
-               copied_early = true;
-
-               tp->ucopy.len -= chunk;
-               tp->copied_seq += chunk;
-               tcp_rcv_space_adjust(sk);
-
-               if ((tp->ucopy.len == 0) ||
-                   (tcp_flag_word(tcp_hdr(skb)) & TCP_FLAG_PSH) ||
-                   (atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1))) {
-                       tp->ucopy.wakeup = 1;
-                       sk->sk_data_ready(sk);
-               }
-       } else if (chunk > 0) {
-               tp->ucopy.wakeup = 1;
-               sk->sk_data_ready(sk);
-       }
-out:
-       return copied_early;
-}
-#endif /* CONFIG_NET_DMA */
-
 /* Does PAWS and seqno based validation of an incoming segment, flags will
  * play significant role here.
  */
@@ -5167,27 +5129,15 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
                        }
                } else {
                        int eaten = 0;
-                       int copied_early = 0;
                        bool fragstolen = false;
 
-                       if (tp->copied_seq == tp->rcv_nxt &&
-                           len - tcp_header_len <= tp->ucopy.len) {
-#ifdef CONFIG_NET_DMA
-                               if (tp->ucopy.task == current &&
-                                   sock_owned_by_user(sk) &&
-                                   tcp_dma_try_early_copy(sk, skb, tcp_header_len)) {
-                                       copied_early = 1;
-                                       eaten = 1;
-                               }
-#endif
-                               if (tp->ucopy.task == current &&
-                                   sock_owned_by_user(sk) && !copied_early) {
-                                       __set_current_state(TASK_RUNNING);
+                       if (tp->ucopy.task == current &&
+                           tp->copied_seq == tp->rcv_nxt &&
+                           len - tcp_header_len <= tp->ucopy.len &&
+                           sock_owned_by_user(sk)) {
+                               __set_current_state(TASK_RUNNING);
 
-                                       if (!tcp_copy_to_iovec(sk, skb, tcp_header_len))
-                                               eaten = 1;
-                               }
-                               if (eaten) {
+                               if (!tcp_copy_to_iovec(sk, skb, tcp_header_len)) {
                                        /* Predicted packet is in window by definition.
                                         * seq == rcv_nxt and rcv_wup <= rcv_nxt.
                                         * Hence, check seq<=rcv_wup reduces to:
@@ -5203,9 +5153,8 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
                                        __skb_pull(skb, tcp_header_len);
                                        tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
                                        NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITSTOUSER);
+                                       eaten = 1;
                                }
-                               if (copied_early)
-                                       tcp_cleanup_rbuf(sk, skb->len);
                        }
                        if (!eaten) {
                                if (tcp_checksum_complete_user(sk, skb))
@@ -5242,14 +5191,8 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
                                        goto no_ack;
                        }
 
-                       if (!copied_early || tp->rcv_nxt != tp->rcv_wup)
-                               __tcp_ack_snd_check(sk, 0);
+                       __tcp_ack_snd_check(sk, 0);
 no_ack:
-#ifdef CONFIG_NET_DMA
-                       if (copied_early)
-                               __skb_queue_tail(&sk->sk_async_wait_queue, skb);
-                       else
-#endif
                        if (eaten)
                                kfree_skb_partial(skb, fragstolen);
                        sk->sk_data_ready(sk);
@@ -5877,3 +5820,156 @@ discard:
        return 0;
 }
 EXPORT_SYMBOL(tcp_rcv_state_process);
+
+static inline void pr_drop_req(struct request_sock *req, __u16 port, int family)
+{
+       struct inet_request_sock *ireq = inet_rsk(req);
+
+       if (family == AF_INET)
+               LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI4/%u\n"),
+                              &ireq->ir_rmt_addr, port);
+#if IS_ENABLED(CONFIG_IPV6)
+       else if (family == AF_INET6)
+               LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI6/%u\n"),
+                              &ireq->ir_v6_rmt_addr, port);
+#endif
+}
+
+int tcp_conn_request(struct request_sock_ops *rsk_ops,
+                    const struct tcp_request_sock_ops *af_ops,
+                    struct sock *sk, struct sk_buff *skb)
+{
+       struct tcp_options_received tmp_opt;
+       struct request_sock *req;
+       struct tcp_sock *tp = tcp_sk(sk);
+       struct dst_entry *dst = NULL;
+       __u32 isn = TCP_SKB_CB(skb)->when;
+       bool want_cookie = false, fastopen;
+       struct flowi fl;
+       struct tcp_fastopen_cookie foc = { .len = -1 };
+       int err;
+
+
+       /* TW buckets are converted to open requests without
+        * limitations: they conserve resources and the peer is
+        * evidently a real one.
+        */
+       if ((sysctl_tcp_syncookies == 2 ||
+            inet_csk_reqsk_queue_is_full(sk)) && !isn) {
+               want_cookie = tcp_syn_flood_action(sk, skb, rsk_ops->slab_name);
+               if (!want_cookie)
+                       goto drop;
+       }
+
+
+       /* Accept backlog is full. If we have already queued enough
+        * of warm entries in syn queue, drop request. It is better than
+        * clogging syn queue with openreqs with exponentially increasing
+        * timeout.
+        */
+       if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) {
+               NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
+               goto drop;
+       }
+
+       req = inet_reqsk_alloc(rsk_ops);
+       if (!req)
+               goto drop;
+
+       tcp_rsk(req)->af_specific = af_ops;
+
+       tcp_clear_options(&tmp_opt);
+       tmp_opt.mss_clamp = af_ops->mss_clamp;
+       tmp_opt.user_mss  = tp->rx_opt.user_mss;
+       tcp_parse_options(skb, &tmp_opt, 0, want_cookie ? NULL : &foc);
+
+       if (want_cookie && !tmp_opt.saw_tstamp)
+               tcp_clear_options(&tmp_opt);
+
+       tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
+       tcp_openreq_init(req, &tmp_opt, skb, sk);
+
+       af_ops->init_req(req, sk, skb);
+
+       if (security_inet_conn_request(sk, skb, req))
+               goto drop_and_free;
+
+       if (!want_cookie || tmp_opt.tstamp_ok)
+               TCP_ECN_create_request(req, skb, sock_net(sk));
+
+       if (want_cookie) {
+               isn = cookie_init_sequence(af_ops, sk, skb, &req->mss);
+               req->cookie_ts = tmp_opt.tstamp_ok;
+       } else if (!isn) {
+               /* VJ's idea. We save last timestamp seen
+                * from the destination in peer table, when entering
+                * state TIME-WAIT, and check against it before
+                * accepting new connection request.
+                *
+                * If "isn" is not zero, this request hit alive
+                * timewait bucket, so that all the necessary checks
+                * are made in the function processing timewait state.
+                */
+               if (tcp_death_row.sysctl_tw_recycle) {
+                       bool strict;
+
+                       dst = af_ops->route_req(sk, &fl, req, &strict);
+
+                       if (dst && strict &&
+                           !tcp_peer_is_proven(req, dst, true,
+                                               tmp_opt.saw_tstamp)) {
+                               NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
+                               goto drop_and_release;
+                       }
+               }
+               /* Kill the following clause, if you dislike this way. */
+               else if (!sysctl_tcp_syncookies &&
+                        (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
+                         (sysctl_max_syn_backlog >> 2)) &&
+                        !tcp_peer_is_proven(req, dst, false,
+                                            tmp_opt.saw_tstamp)) {
+                       /* Without syncookies last quarter of
+                        * backlog is filled with destinations,
+                        * proven to be alive.
+                        * It means that we continue to communicate
+                        * to destinations, already remembered
+                        * to the moment of synflood.
+                        */
+                       pr_drop_req(req, ntohs(tcp_hdr(skb)->source),
+                                   rsk_ops->family);
+                       goto drop_and_release;
+               }
+
+               isn = af_ops->init_seq(skb);
+       }
+       if (!dst) {
+               dst = af_ops->route_req(sk, &fl, req, NULL);
+               if (!dst)
+                       goto drop_and_free;
+       }
+
+       tcp_rsk(req)->snt_isn = isn;
+       tcp_openreq_init_rwin(req, sk, dst);
+       fastopen = !want_cookie &&
+                  tcp_try_fastopen(sk, skb, req, &foc, dst);
+       err = af_ops->send_synack(sk, dst, &fl, req,
+                                 skb_get_queue_mapping(skb), &foc);
+       if (!fastopen) {
+               if (err || want_cookie)
+                       goto drop_and_free;
+
+               tcp_rsk(req)->listener = NULL;
+               af_ops->queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
+       }
+
+       return 0;
+
+drop_and_release:
+       dst_release(dst);
+drop_and_free:
+       reqsk_free(req);
+drop:
+       NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
+       return 0;
+}
+EXPORT_SYMBOL(tcp_conn_request);
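
With this helper in place, an address family's SYN handler is expected to shrink to a family-specific sanity check plus a single call into tcp_conn_request(). A sketch of what the IPv4 side would look like (the ops table names below, tcp_request_sock_ops and tcp_request_sock_ipv4_ops, are assumptions based on the companion patches, not part of this hunk):

/* Sketch only: IPv4 conn_request reduced to a call into tcp_conn_request(). */
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
        /* Never answer SYNs sent to broadcast or multicast addresses. */
        if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
                goto drop;

        return tcp_conn_request(&tcp_request_sock_ops,
                                &tcp_request_sock_ipv4_ops, sk, skb);

drop:
        NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
        return 0;
}
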