Merge tag 'dmaengine-3.17' of git://git.kernel.org/pub/scm/linux/kernel/git/djbw...

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 1342e9851f97c83050b4c8674be0a5096db7f954..0185eea59342a9318ba42a9eb2f77da900b53e91 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -73,6 +73,7 @@
 #include <net/inet_common.h>
 #include <linux/ipsec.h>
 #include <asm/unaligned.h>
+#include <linux/errqueue.h>
 
 int sysctl_tcp_timestamps __read_mostly = 1;
 int sysctl_tcp_window_scaling __read_mostly = 1;
@@ -666,11 +667,11 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb)
  * To save cycles in the RFC 1323 implementation it was better to break
  * it up into three procedures. -- erics
  */
-static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt)
+static void tcp_rtt_estimator(struct sock *sk, long mrtt_us)
 {
        struct tcp_sock *tp = tcp_sk(sk);
-       long m = mrtt; /* RTT */
-       u32 srtt = tp->srtt;
+       long m = mrtt_us; /* RTT */
+       u32 srtt = tp->srtt_us;
 
        /*      The following amusing code comes from Jacobson's
         *      article in SIGCOMM '88.  Note that rtt and mdev
@@ -693,7 +694,7 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt)
                srtt += m;              /* rtt = 7/8 rtt + 1/8 new */
                if (m < 0) {
                        m = -m;         /* m is now abs(error) */
-                       m -= (tp->mdev >> 2);   /* similar update on mdev */
+                       m -= (tp->mdev_us >> 2);   /* similar update on mdev */
                        /* This is similar to one of Eifel findings.
                         * Eifel blocks mdev updates when rtt decreases.
                         * This solution is a bit different: we use finer gain
@@ -705,28 +706,29 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt)
                        if (m > 0)
                                m >>= 3;
                } else {
-                       m -= (tp->mdev >> 2);   /* similar update on mdev */
+                       m -= (tp->mdev_us >> 2);   /* similar update on mdev */
                }
-               tp->mdev += m;          /* mdev = 3/4 mdev + 1/4 new */
-               if (tp->mdev > tp->mdev_max) {
-                       tp->mdev_max = tp->mdev;
-                       if (tp->mdev_max > tp->rttvar)
-                               tp->rttvar = tp->mdev_max;
+               tp->mdev_us += m;               /* mdev = 3/4 mdev + 1/4 new */
+               if (tp->mdev_us > tp->mdev_max_us) {
+                       tp->mdev_max_us = tp->mdev_us;
+                       if (tp->mdev_max_us > tp->rttvar_us)
+                               tp->rttvar_us = tp->mdev_max_us;
                }
                if (after(tp->snd_una, tp->rtt_seq)) {
-                       if (tp->mdev_max < tp->rttvar)
-                               tp->rttvar -= (tp->rttvar - tp->mdev_max) >> 2;
+                       if (tp->mdev_max_us < tp->rttvar_us)
+                               tp->rttvar_us -= (tp->rttvar_us - tp->mdev_max_us) >> 2;
                        tp->rtt_seq = tp->snd_nxt;
-                       tp->mdev_max = tcp_rto_min(sk);
+                       tp->mdev_max_us = tcp_rto_min_us(sk);
                }
        } else {
                /* no previous measure. */
                srtt = m << 3;          /* take the measured time to be rtt */
-               tp->mdev = m << 1;      /* make sure rto = 3*rtt */
-               tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk));
+               tp->mdev_us = m << 1;   /* make sure rto = 3*rtt */
+               tp->rttvar_us = max(tp->mdev_us, tcp_rto_min_us(sk));
+               tp->mdev_max_us = tp->rttvar_us;
                tp->rtt_seq = tp->snd_nxt;
        }
-       tp->srtt = max(1U, srtt);
+       tp->srtt_us = max(1U, srtt);
 }
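
For readers who want to poke at the estimator outside the kernel, below is a minimal user-space sketch of the Jacobson/Karels update that the hunk above converts to microseconds. It keeps the kernel's fixed-point convention (srtt_us stores 8 * SRTT, mdev_us stores 4 * mdev) but leaves out the Eifel-style damping and the mdev_max/rttvar windowing; all names here are illustrative, not kernel API.

#include <stdint.h>
#include <stdio.h>

/* Toy estimator state: srtt8 holds 8 * SRTT, mdev4 holds 4 * mdev,
 * both in microseconds, mirroring srtt_us / mdev_us in the patch.
 * The RTO would then be (srtt8 >> 3) + mdev4, i.e. SRTT + 4 * mdev. */
struct rtt_est {
        uint32_t srtt8;
        uint32_t mdev4;
};

static void rtt_sample(struct rtt_est *e, long mrtt_us)
{
        long m = mrtt_us;

        if (e->srtt8) {
                m -= (e->srtt8 >> 3);   /* m is now the error in the estimate */
                e->srtt8 += m;          /* srtt = 7/8 srtt + 1/8 new          */
                if (m < 0)
                        m = -m;
                m -= (e->mdev4 >> 2);
                e->mdev4 += m;          /* mdev = 3/4 mdev + 1/4 |error|      */
        } else {
                /* First measurement: take it as SRTT and make RTO ~ 3 * RTT. */
                e->srtt8 = m << 3;
                e->mdev4 = m << 1;
        }
}

int main(void)
{
        struct rtt_est e = { 0, 0 };
        long samples[] = { 40000, 42000, 39000, 120000, 41000 };       /* us */
        unsigned int i;

        for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
                rtt_sample(&e, samples[i]);
                printf("sample=%ldus srtt=%uus rto=%uus\n", samples[i],
                       e.srtt8 >> 3, (e.srtt8 >> 3) + e.mdev4);
        }
        return 0;
}

The scaled storage is why later hunks shift srtt_us right by 3, 4 or 5 whenever they need SRTT, SRTT/2 or SRTT/4 in plain microseconds.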
 
 /* Set the sk_pacing_rate to allow proper sizing of TSO packets.
@@ -741,20 +743,12 @@ static void tcp_update_pacing_rate(struct sock *sk)
        u64 rate;
 
        /* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) */
-       rate = (u64)tp->mss_cache * 2 * (HZ << 3);
+       rate = (u64)tp->mss_cache * 2 * (USEC_PER_SEC << 3);
 
        rate *= max(tp->snd_cwnd, tp->packets_out);
 
-       /* Correction for small srtt and scheduling constraints.
-        * For small rtt, consider noise is too high, and use
-        * the minimal value (srtt = 1 -> 125 us for HZ=1000)
-        *
-        * We probably need usec resolution in the future.
-        * Note: This also takes care of possible srtt=0 case,
-        * when tcp_rtt_estimator() was not yet called.
-        */
-       if (tp->srtt > 8 + 2)
-               do_div(rate, tp->srtt);
+       if (likely(tp->srtt_us))
+               do_div(rate, tp->srtt_us);
 
        /* ACCESS_ONCE() is needed because sch_fq fetches sk_pacing_rate
          * without any lock. We want to make sure compiler won't store
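
The pacing computation itself is easy to reproduce in isolation. A sketch under the same assumption that srtt_us carries 8 * SRTT in microseconds (which is what the USEC_PER_SEC << 3 factor cancels out); the helper name is made up.

#include <stdint.h>
#include <stdio.h>

#define USEC_PER_SEC 1000000ULL

/* 200% of mss * cwnd / srtt, in bytes per second.  srtt_us is the
 * scaled 8 * SRTT value kept by tcp_rtt_estimator(), hence the extra
 * << 3 in the numerator. */
static uint64_t pacing_rate_bytes_per_sec(uint32_t mss, uint32_t cwnd, uint32_t srtt_us)
{
        uint64_t rate = (uint64_t)mss * 2 * (USEC_PER_SEC << 3);

        rate *= cwnd;
        if (srtt_us)            /* zero only before the first RTT sample */
                rate /= srtt_us;
        return rate;
}

int main(void)
{
        /* 1448-byte MSS, cwnd of 10, 40 ms RTT -> about 724 kB/s (2 * cwnd * mss / RTT) */
        printf("%llu bytes/sec\n",
               (unsigned long long)pacing_rate_bytes_per_sec(1448, 10, 40000 << 3));
        return 0;
}

With srtt_us now always valid in microseconds, the old HZ-based special-casing for tiny srtt values is no longer needed, which is why the patch drops that comment block.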
@@ -1112,7 +1106,7 @@ static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb,
        }
 
        /* D-SACK for already forgotten data... Do dumb counting. */
-       if (dup_sack && tp->undo_marker && tp->undo_retrans &&
+       if (dup_sack && tp->undo_marker && tp->undo_retrans > 0 &&
            !after(end_seq_0, prior_snd_una) &&
            after(end_seq_0, tp->undo_marker))
                tp->undo_retrans--;
@@ -1121,10 +1115,10 @@ static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb,
 }
 
 struct tcp_sacktag_state {
-       int reord;
-       int fack_count;
-       int flag;
-       s32 rtt; /* RTT measured by SACKing never-retransmitted data */
+       int     reord;
+       int     fack_count;
+       long    rtt_us; /* RTT measured by SACKing never-retransmitted data */
+       int     flag;
 };
 
 /* Check if skb is fully within the SACK block. In presence of GSO skbs,
@@ -1168,12 +1162,12 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
                        unsigned int new_len = (pkt_len / mss) * mss;
                        if (!in_sack && new_len < pkt_len) {
                                new_len += mss;
-                               if (new_len > skb->len)
+                               if (new_len >= skb->len)
                                        return 0;
                        }
                        pkt_len = new_len;
                }
-               err = tcp_fragment(sk, skb, pkt_len, mss);
+               err = tcp_fragment(sk, skb, pkt_len, mss, GFP_ATOMIC);
                if (err < 0)
                        return err;
        }
@@ -1185,14 +1179,15 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
 static u8 tcp_sacktag_one(struct sock *sk,
                          struct tcp_sacktag_state *state, u8 sacked,
                          u32 start_seq, u32 end_seq,
-                         int dup_sack, int pcount, u32 xmit_time)
+                         int dup_sack, int pcount,
+                         const struct skb_mstamp *xmit_time)
 {
        struct tcp_sock *tp = tcp_sk(sk);
        int fack_count = state->fack_count;
 
        /* Account D-SACK for retransmitted packet. */
        if (dup_sack && (sacked & TCPCB_RETRANS)) {
-               if (tp->undo_marker && tp->undo_retrans &&
+               if (tp->undo_marker && tp->undo_retrans > 0 &&
                    after(end_seq, tp->undo_marker))
                        tp->undo_retrans--;
                if (sacked & TCPCB_SACKED_ACKED)
@@ -1226,8 +1221,13 @@ static u8 tcp_sacktag_one(struct sock *sk,
                                if (!after(end_seq, tp->high_seq))
                                        state->flag |= FLAG_ORIG_SACK_ACKED;
                                /* Pick the earliest sequence sacked for RTT */
-                               if (state->rtt < 0)
-                                       state->rtt = tcp_time_stamp - xmit_time;
+                               if (state->rtt_us < 0) {
+                                       struct skb_mstamp now;
+
+                                       skb_mstamp_get(&now);
+                                       state->rtt_us = skb_mstamp_us_delta(&now,
+                                                               xmit_time);
+                               }
                        }
 
                        if (sacked & TCPCB_LOST) {
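
skb_mstamp_get() and skb_mstamp_us_delta() are the skb microsecond-clock helpers this series builds on. Outside the kernel, the same "take one RTT sample, from the earliest never-retransmitted segment a SACK block covers" pattern can be approximated with CLOCK_MONOTONIC; the sketch below is an analogy, not the kernel implementation.

#include <stdint.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

/* Rough user-space analogue of skb_mstamp: a microsecond stamp taken at
 * transmit time and later compared against "now" to yield an RTT sample. */
struct us_stamp {
        uint64_t us;
};

static void us_stamp_get(struct us_stamp *s)
{
        struct timespec ts;

        clock_gettime(CLOCK_MONOTONIC, &ts);
        s->us = (uint64_t)ts.tv_sec * 1000000 + (uint64_t)ts.tv_nsec / 1000;
}

static long us_stamp_delta(const struct us_stamp *now, const struct us_stamp *then)
{
        return (long)(now->us - then->us);
}

/* As in the hunk above: take at most one sample per ACK, from the first
 * never-retransmitted segment covered by a SACK (rtt_us < 0 == none yet). */
static void maybe_sample_rtt(long *rtt_us, const struct us_stamp *xmit_time)
{
        if (*rtt_us < 0) {
                struct us_stamp now;

                us_stamp_get(&now);
                *rtt_us = us_stamp_delta(&now, xmit_time);
        }
}

int main(void)
{
        struct us_stamp xmit;
        long rtt_us = -1L;              /* no sample yet, as in the patch */

        us_stamp_get(&xmit);            /* pretend the segment left here  */
        usleep(1500);                   /* ...and was SACKed 1.5 ms later */
        maybe_sample_rtt(&rtt_us, &xmit);
        printf("rtt sample: %ld us\n", rtt_us);
        return 0;
}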
@@ -1286,7 +1286,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
         */
        tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked,
                        start_seq, end_seq, dup_sack, pcount,
-                       TCP_SKB_CB(skb)->when);
+                       &skb->skb_mstamp);
 
        if (skb == tp->lost_skb_hint)
                tp->lost_cnt_hint += pcount;
@@ -1564,7 +1564,7 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
                                                TCP_SKB_CB(skb)->end_seq,
                                                dup_sack,
                                                tcp_skb_pcount(skb),
-                                               TCP_SKB_CB(skb)->when);
+                                               &skb->skb_mstamp);
 
                        if (!before(TCP_SKB_CB(skb)->seq,
                                    tcp_highest_sack_seq(tp)))
@@ -1621,7 +1621,7 @@ static int tcp_sack_cache_ok(const struct tcp_sock *tp, const struct tcp_sack_bl
 
 static int
 tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
-                       u32 prior_snd_una, s32 *sack_rtt)
+                       u32 prior_snd_una, long *sack_rtt_us)
 {
        struct tcp_sock *tp = tcp_sk(sk);
        const unsigned char *ptr = (skb_transport_header(ack_skb) +
@@ -1639,7 +1639,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
 
        state.flag = 0;
        state.reord = tp->packets_out;
-       state.rtt = -1;
+       state.rtt_us = -1L;
 
        if (!tp->sacked_out) {
                if (WARN_ON(tp->fackets_out))
@@ -1823,7 +1823,7 @@ out:
        WARN_ON((int)tp->retrans_out < 0);
        WARN_ON((int)tcp_packets_in_flight(tp) < 0);
 #endif
-       *sack_rtt = state.rtt;
+       *sack_rtt_us = state.rtt_us;
        return state.flag;
 }
 
@@ -1893,7 +1893,7 @@ static void tcp_clear_retrans_partial(struct tcp_sock *tp)
        tp->lost_out = 0;
 
        tp->undo_marker = 0;
-       tp->undo_retrans = 0;
+       tp->undo_retrans = -1;
 }
 
 void tcp_clear_retrans(struct tcp_sock *tp)
@@ -1904,16 +1904,17 @@ void tcp_clear_retrans(struct tcp_sock *tp)
        tp->sacked_out = 0;
 }
 
-/* Enter Loss state. If "how" is not zero, forget all SACK information
+/* Enter Loss state. If we detect SACK reneging, forget all SACK information
  * and reset tags completely, otherwise preserve SACKs. If receiver
  * dropped its ofo queue, we will know this due to reneging detection.
  */
-void tcp_enter_loss(struct sock *sk, int how)
+void tcp_enter_loss(struct sock *sk)
 {
        const struct inet_connection_sock *icsk = inet_csk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        struct sk_buff *skb;
        bool new_recovery = false;
+       bool is_reneg;                  /* is receiver reneging on SACKs? */
 
        /* Reduce ssthresh if it has not yet been made inside this window. */
        if (icsk->icsk_ca_state <= TCP_CA_Disorder ||
@@ -1934,7 +1935,11 @@ void tcp_enter_loss(struct sock *sk, int how)
                tcp_reset_reno_sack(tp);
 
        tp->undo_marker = tp->snd_una;
-       if (how) {
+
+       skb = tcp_write_queue_head(sk);
+       is_reneg = skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED);
+       if (is_reneg) {
+               NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSACKRENEGING);
                tp->sacked_out = 0;
                tp->fackets_out = 0;
        }
@@ -1948,7 +1953,7 @@ void tcp_enter_loss(struct sock *sk, int how)
                        tp->undo_marker = 0;
 
                TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED;
-               if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || how) {
+               if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || is_reneg) {
                        TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
                        TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
                        tp->lost_out += tcp_skb_pcount(skb);
@@ -1981,19 +1986,21 @@ void tcp_enter_loss(struct sock *sk, int how)
  * remembered SACKs do not reflect real state of receiver i.e.
  * receiver _host_ is heavily congested (or buggy).
  *
- * Do processing similar to RTO timeout.
+ * To avoid big spurious retransmission bursts due to transient SACK
+ * scoreboard oddities that look like reneging, we give the receiver a
+ * little time (max(RTT/2, 10ms)) to send us some more ACKs that will
+ * restore sanity to the SACK scoreboard. If the apparent reneging
+ * persists until this RTO then we'll clear the SACK scoreboard.
  */
 static bool tcp_check_sack_reneging(struct sock *sk, int flag)
 {
        if (flag & FLAG_SACK_RENEGING) {
-               struct inet_connection_sock *icsk = inet_csk(sk);
-               NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSACKRENEGING);
+               struct tcp_sock *tp = tcp_sk(sk);
+               unsigned long delay = max(usecs_to_jiffies(tp->srtt_us >> 4),
+                                         msecs_to_jiffies(10));
 
-               tcp_enter_loss(sk, 1);
-               icsk->icsk_retransmits++;
-               tcp_retransmit_skb(sk, tcp_write_queue_head(sk));
                inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
-                                         icsk->icsk_rto, TCP_RTO_MAX);
+                                         delay, TCP_RTO_MAX);
                return true;
        }
        return false;
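
Because srtt_us stores 8 * SRTT, the srtt_us >> 4 above is SRTT/2, so the reneging timer fires after max(SRTT/2, 10 ms). A small sketch of just that arithmetic, in plain microseconds rather than jiffies:

#include <stdint.h>
#include <stdio.h>

/* Delay before acting on apparent SACK reneging: max(SRTT / 2, 10 ms).
 * srtt_us is the scaled 8 * SRTT value, so >> 4 yields SRTT / 2. */
static uint32_t reneging_delay_us(uint32_t srtt_us)
{
        uint32_t half_rtt_us = srtt_us >> 4;

        return half_rtt_us > 10000 ? half_rtt_us : 10000;
}

int main(void)
{
        printf("srtt=4ms  -> wait %u us\n", reneging_delay_us(4000 << 3));   /* 10 ms floor    */
        printf("srtt=80ms -> wait %u us\n", reneging_delay_us(80000 << 3));  /* 40 ms = SRTT/2 */
        return 0;
}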
@@ -2034,10 +2041,12 @@ static bool tcp_pause_early_retransmit(struct sock *sk, int flag)
         * available, or RTO is scheduled to fire first.
         */
        if (sysctl_tcp_early_retrans < 2 || sysctl_tcp_early_retrans > 3 ||
-           (flag & FLAG_ECE) || !tp->srtt)
+           (flag & FLAG_ECE) || !tp->srtt_us)
                return false;
 
-       delay = max_t(unsigned long, (tp->srtt >> 5), msecs_to_jiffies(2));
+       delay = max(usecs_to_jiffies(tp->srtt_us >> 5),
+                   msecs_to_jiffies(2));
+
        if (!time_after(inet_csk(sk)->icsk_timeout, (jiffies + delay)))
                return false;
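
The early-retransmit pause uses the same trick one shift further: srtt_us >> 5 is SRTT/4, floored at 2 ms and then converted to jiffies. A sketch with an assumed HZ of 250 (the real value is a kernel build option) and a simplified round-up conversion standing in for usecs_to_jiffies()/msecs_to_jiffies():

#include <stdio.h>

#define HZ 250          /* assumed for the example; the real value is a build-time option */

/* Simplified round-up conversion standing in for usecs_to_jiffies(). */
static unsigned long us_to_jiffies(unsigned long us)
{
        return (us * HZ + 999999UL) / 1000000UL;
}

/* Early-retransmit delay from the hunk above: max(SRTT / 4, 2 ms) in
 * jiffies, with srtt_us again being the scaled 8 * SRTT value. */
static unsigned long early_retrans_delay(unsigned int srtt_us)
{
        unsigned long quarter_rtt = us_to_jiffies(srtt_us >> 5);
        unsigned long two_ms = us_to_jiffies(2000);

        return quarter_rtt > two_ms ? quarter_rtt : two_ms;
}

int main(void)
{
        printf("srtt=20ms -> %lu jiffies\n", early_retrans_delay(20000 << 3)); /* 5 ms -> 2 jiffies at HZ=250 */
        printf("srtt=1ms  -> %lu jiffies\n", early_retrans_delay(1000 << 3));  /* 2 ms floor -> 1 jiffy       */
        return 0;
}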
 
@@ -2239,7 +2248,8 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
                                break;
 
                        mss = skb_shinfo(skb)->gso_size;
-                       err = tcp_fragment(sk, skb, (packets - oldcnt) * mss, mss);
+                       err = tcp_fragment(sk, skb, (packets - oldcnt) * mss,
+                                          mss, GFP_ATOMIC);
                        if (err < 0)
                                break;
                        cnt = packets;
@@ -2472,7 +2482,7 @@ static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo)
  *     losses and/or application stalls), do not perform any further cwnd
  *     reductions, but instead slow start up to ssthresh.
  */
-static void tcp_init_cwnd_reduction(struct sock *sk, const bool set_ssthresh)
+static void tcp_init_cwnd_reduction(struct sock *sk)
 {
        struct tcp_sock *tp = tcp_sk(sk);
 
@@ -2482,8 +2492,7 @@ static void tcp_init_cwnd_reduction(struct sock *sk, const bool set_ssthresh)
        tp->prior_cwnd = tp->snd_cwnd;
        tp->prr_delivered = 0;
        tp->prr_out = 0;
-       if (set_ssthresh)
-               tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk);
+       tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk);
        TCP_ECN_queue_cwr(tp);
 }
 
@@ -2525,14 +2534,14 @@ static inline void tcp_end_cwnd_reduction(struct sock *sk)
 }
 
 /* Enter CWR state. Disable cwnd undo since congestion is proven with ECN */
-void tcp_enter_cwr(struct sock *sk, const int set_ssthresh)
+void tcp_enter_cwr(struct sock *sk)
 {
        struct tcp_sock *tp = tcp_sk(sk);
 
        tp->prior_ssthresh = 0;
        if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
                tp->undo_marker = 0;
-               tcp_init_cwnd_reduction(sk, set_ssthresh);
+               tcp_init_cwnd_reduction(sk);
                tcp_set_ca_state(sk, TCP_CA_CWR);
        }
 }
@@ -2561,7 +2570,7 @@ static void tcp_try_to_open(struct sock *sk, int flag, const int prior_unsacked)
                tp->retrans_stamp = 0;
 
        if (flag & FLAG_ECE)
-               tcp_enter_cwr(sk, 1);
+               tcp_enter_cwr(sk);
 
        if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) {
                tcp_try_keep_open(sk);
@@ -2662,12 +2671,12 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
 
        tp->prior_ssthresh = 0;
        tp->undo_marker = tp->snd_una;
-       tp->undo_retrans = tp->retrans_out;
+       tp->undo_retrans = tp->retrans_out ? : -1;
 
        if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
                if (!ece_ack)
                        tp->prior_ssthresh = tcp_current_ssthresh(sk);
-               tcp_init_cwnd_reduction(sk, true);
+               tcp_init_cwnd_reduction(sk);
        }
        tcp_set_ca_state(sk, TCP_CA_Recovery);
 }
@@ -2677,18 +2686,16 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
  */
 static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack)
 {
-       struct inet_connection_sock *icsk = inet_csk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        bool recovered = !before(tp->snd_una, tp->high_seq);
 
        if (tp->frto) { /* F-RTO RFC5682 sec 3.1 (sack enhanced version). */
-               if (flag & FLAG_ORIG_SACK_ACKED) {
-                       /* Step 3.b. A timeout is spurious if not all data are
-                        * lost, i.e., never-retransmitted data are (s)acked.
-                        */
-                       tcp_try_undo_loss(sk, true);
+               /* Step 3.b. A timeout is spurious if not all data are
+                * lost, i.e., never-retransmitted data are (s)acked.
+                */
+               if (tcp_try_undo_loss(sk, flag & FLAG_ORIG_SACK_ACKED))
                        return;
-               }
+
                if (after(tp->snd_nxt, tp->high_seq) &&
                    (flag & FLAG_DATA_SACKED || is_dupack)) {
                        tp->frto = 0; /* Loss was real: 2nd part of step 3.a */
@@ -2704,12 +2711,9 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack)
 
        if (recovered) {
                /* F-RTO RFC5682 sec 3.1 step 2.a and 1st part of step 3.a */
-               icsk->icsk_retransmits = 0;
                tcp_try_undo_recovery(sk);
                return;
        }
-       if (flag & FLAG_DATA_ACKED)
-               icsk->icsk_retransmits = 0;
        if (tcp_is_reno(tp)) {
                /* A Reno DUPACK means new data in F-RTO step 2.b above are
                 * delivered. Lower inflight to clock out (re)transmissions.
@@ -2884,7 +2888,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
 }
 
 static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
-                                     s32 seq_rtt, s32 sack_rtt)
+                                     long seq_rtt_us, long sack_rtt_us)
 {
        const struct tcp_sock *tp = tcp_sk(sk);
 
@@ -2894,10 +2898,10 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
         * is acked (RFC6298).
         */
        if (flag & FLAG_RETRANS_DATA_ACKED)
-               seq_rtt = -1;
+               seq_rtt_us = -1L;
 
-       if (seq_rtt < 0)
-               seq_rtt = sack_rtt;
+       if (seq_rtt_us < 0)
+               seq_rtt_us = sack_rtt_us;
 
        /* RTTM Rule: A TSecr value received in a segment is used to
         * update the averaged RTT measurement only if the segment
@@ -2905,14 +2909,14 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
         * left edge of the send window.
         * See draft-ietf-tcplw-high-performance-00, section 3.3.
         */
-       if (seq_rtt < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
+       if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
            flag & FLAG_ACKED)
-               seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr;
+               seq_rtt_us = jiffies_to_usecs(tcp_time_stamp - tp->rx_opt.rcv_tsecr);
 
-       if (seq_rtt < 0)
+       if (seq_rtt_us < 0)
                return false;
 
-       tcp_rtt_estimator(sk, seq_rtt);
+       tcp_rtt_estimator(sk, seq_rtt_us);
        tcp_set_rto(sk);
 
        /* RFC6298: only reset backoff on valid RTT measurement. */
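
Putting the pieces of tcp_ack_update_rtt() together: a cumulative-ACK sample is discarded when it covers retransmitted data (Karn's rule), a SACK-derived sample is used next, and the echoed timestamp is the last resort (and, in the kernel, only when the ACK actually cleared data off the retransmit queue, FLAG_ACKED). A compact sketch of that selection with made-up parameter names:

/* One RTT sample per ACK, chosen in order of preference as in
 * tcp_ack_update_rtt() above; a negative value means "no usable sample".
 * Parameter names are illustrative only. */
long select_rtt_sample(int retrans_data_acked,
                       long seq_rtt_us,   /* from the cumulatively ACKed skb      */
                       long sack_rtt_us,  /* from never-retransmitted SACKed data */
                       long tsecr_rtt_us) /* from the echoed timestamp, if any    */
{
        if (retrans_data_acked)         /* Karn's rule: the sample is ambiguous */
                seq_rtt_us = -1L;

        if (seq_rtt_us < 0)
                seq_rtt_us = sack_rtt_us;

        if (seq_rtt_us < 0)
                seq_rtt_us = tsecr_rtt_us;

        return seq_rtt_us;              /* feed to tcp_rtt_estimator() if >= 0 */
}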
@@ -2924,22 +2928,23 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
 static void tcp_synack_rtt_meas(struct sock *sk, const u32 synack_stamp)
 {
        struct tcp_sock *tp = tcp_sk(sk);
-       s32 seq_rtt = -1;
+       long seq_rtt_us = -1L;
 
        if (synack_stamp && !tp->total_retrans)
-               seq_rtt = tcp_time_stamp - synack_stamp;
+               seq_rtt_us = jiffies_to_usecs(tcp_time_stamp - synack_stamp);
 
        /* If the ACK acks both the SYNACK and the (Fast Open'd) data packets
         * sent in SYN_RECV, SYNACK RTT is the smooth RTT computed in tcp_ack()
         */
-       if (!tp->srtt)
-               tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, seq_rtt, -1);
+       if (!tp->srtt_us)
+               tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, seq_rtt_us, -1L);
 }
 
-static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 acked, u32 in_flight)
+static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
 {
        const struct inet_connection_sock *icsk = inet_csk(sk);
-       icsk->icsk_ca_ops->cong_avoid(sk, ack, acked, in_flight);
+
+       icsk->icsk_ca_ops->cong_avoid(sk, ack, acked);
        tcp_sk(sk)->snd_cwnd_stamp = tcp_time_stamp;
 }
 
@@ -3022,26 +3027,32 @@ static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)
  * arrived at the other end.
  */
 static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
-                              u32 prior_snd_una, s32 sack_rtt)
+                              u32 prior_snd_una, long sack_rtt_us)
 {
-       struct tcp_sock *tp = tcp_sk(sk);
        const struct inet_connection_sock *icsk = inet_csk(sk);
-       struct sk_buff *skb;
-       u32 now = tcp_time_stamp;
+       struct skb_mstamp first_ackt, last_ackt, now;
+       struct tcp_sock *tp = tcp_sk(sk);
+       u32 prior_sacked = tp->sacked_out;
+       u32 reord = tp->packets_out;
        bool fully_acked = true;
-       int flag = 0;
+       long ca_seq_rtt_us = -1L;
+       long seq_rtt_us = -1L;
+       struct sk_buff *skb;
        u32 pkts_acked = 0;
-       u32 reord = tp->packets_out;
-       u32 prior_sacked = tp->sacked_out;
-       s32 seq_rtt = -1;
-       s32 ca_seq_rtt = -1;
-       ktime_t last_ackt = net_invalid_timestamp();
        bool rtt_update;
+       int flag = 0;
+
+       first_ackt.v64 = 0;
 
        while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) {
+               struct skb_shared_info *shinfo = skb_shinfo(skb);
                struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
-               u32 acked_pcount;
                u8 sacked = scb->sacked;
+               u32 acked_pcount;
+
+               if (unlikely(shinfo->tx_flags & SKBTX_ACK_TSTAMP) &&
+                   between(shinfo->tskey, prior_snd_una, tp->snd_una - 1))
+                       __skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK);
 
                /* Determine how many packets and what bytes were acked, tso and else */
                if (after(scb->end_seq, tp->snd_una)) {
@@ -3063,11 +3074,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
                                tp->retrans_out -= acked_pcount;
                        flag |= FLAG_RETRANS_DATA_ACKED;
                } else {
-                       ca_seq_rtt = now - scb->when;
-                       last_ackt = skb->tstamp;
-                       if (seq_rtt < 0) {
-                               seq_rtt = ca_seq_rtt;
-                       }
+                       last_ackt = skb->skb_mstamp;
+                       WARN_ON_ONCE(last_ackt.v64 == 0);
+                       if (!first_ackt.v64)
+                               first_ackt = last_ackt;
+
                        if (!(sacked & TCPCB_SACKED_ACKED))
                                reord = min(pkts_acked, reord);
                        if (!after(scb->end_seq, tp->high_seq))
@@ -3113,7 +3124,13 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
        if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
                flag |= FLAG_SACK_RENEGING;
 
-       rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt, sack_rtt);
+       skb_mstamp_get(&now);
+       if (first_ackt.v64) {
+               seq_rtt_us = skb_mstamp_us_delta(&now, &first_ackt);
+               ca_seq_rtt_us = skb_mstamp_us_delta(&now, &last_ackt);
+       }
+
+       rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us);
 
        if (flag & FLAG_ACKED) {
                const struct tcp_congestion_ops *ca_ops
@@ -3141,25 +3158,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 
                tp->fackets_out -= min(pkts_acked, tp->fackets_out);
 
-               if (ca_ops->pkts_acked) {
-                       s32 rtt_us = -1;
-
-                       /* Is the ACK triggering packet unambiguous? */
-                       if (!(flag & FLAG_RETRANS_DATA_ACKED)) {
-                               /* High resolution needed and available? */
-                               if (ca_ops->flags & TCP_CONG_RTT_STAMP &&
-                                   !ktime_equal(last_ackt,
-                                                net_invalid_timestamp()))
-                                       rtt_us = ktime_us_delta(ktime_get_real(),
-                                                               last_ackt);
-                               else if (ca_seq_rtt >= 0)
-                                       rtt_us = jiffies_to_usecs(ca_seq_rtt);
-                       }
+               if (ca_ops->pkts_acked)
+                       ca_ops->pkts_acked(sk, pkts_acked, ca_seq_rtt_us);
 
-                       ca_ops->pkts_acked(sk, pkts_acked, rtt_us);
-               }
-       } else if (skb && rtt_update && sack_rtt >= 0 &&
-                  sack_rtt > (s32)(now - TCP_SKB_CB(skb)->when)) {
+       } else if (skb && rtt_update && sack_rtt_us >= 0 &&
+                  sack_rtt_us > skb_mstamp_us_delta(&now, &skb->skb_mstamp)) {
                /* Do not re-arm RTO if the sack RTT is measured from data sent
                 * after when the head was last (re)transmitted. Otherwise the
                 * timeout may continue to extend in loss recovery.
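
The reworked tcp_clean_rtx_queue() records two timestamps while walking the queue: first_ackt, from the oldest newly acked, never-retransmitted skb, becomes seq_rtt_us for the RTO estimator, while last_ackt, from the newest one, becomes ca_seq_rtt_us and is handed to the congestion module's pkts_acked() hook. A toy illustration of the two deltas, with invented transmit times:

#include <stdint.h>
#include <stdio.h>

/* Given the (monotonic, microsecond) transmit stamps of the skbs a
 * cumulative ACK just covered, compute the two RTT samples the way the
 * patch does: one from the first (oldest) skb, one from the last. */
static void acked_rtts(const uint64_t *xmit_us, int n, uint64_t now_us,
                       long *seq_rtt_us, long *ca_seq_rtt_us)
{
        *seq_rtt_us = -1L;
        *ca_seq_rtt_us = -1L;
        if (n <= 0)
                return;

        *seq_rtt_us = (long)(now_us - xmit_us[0]);              /* first_ackt */
        *ca_seq_rtt_us = (long)(now_us - xmit_us[n - 1]);       /* last_ackt  */
}

int main(void)
{
        uint64_t xmit[] = { 1000, 21000, 41000 };       /* three skbs, 20 ms apart */
        long seq_rtt, ca_rtt;

        acked_rtts(xmit, 3, 61000, &seq_rtt, &ca_rtt);
        printf("seq_rtt=%ldus (oldest skb), ca_rtt=%ldus (newest skb)\n",
               seq_rtt, ca_rtt);
        return 0;
}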
@@ -3350,7 +3353,7 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)
                tp->tlp_high_seq = 0;
                /* Don't reduce cwnd if DSACK arrives for TLP retrans. */
                if (!(flag & FLAG_DSACKING_ACK)) {
-                       tcp_init_cwnd_reduction(sk, true);
+                       tcp_init_cwnd_reduction(sk);
                        tcp_set_ca_state(sk, TCP_CA_CWR);
                        tcp_end_cwnd_reduction(sk);
                        tcp_try_keep_open(sk);
@@ -3369,12 +3372,11 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
        u32 ack_seq = TCP_SKB_CB(skb)->seq;
        u32 ack = TCP_SKB_CB(skb)->ack_seq;
        bool is_dupack = false;
-       u32 prior_in_flight, prior_cwnd = tp->snd_cwnd, prior_rtt = tp->srtt;
        u32 prior_fackets;
        int prior_packets = tp->packets_out;
        const int prior_unsacked = tp->packets_out - tp->sacked_out;
        int acked = 0; /* Number of packets newly acked */
-       s32 sack_rtt = -1;
+       long sack_rtt_us = -1L;
 
        /* If the ack is older than previous acks
         * then we can probably ignore it.
@@ -3398,11 +3400,12 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
            icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
                tcp_rearm_rto(sk);
 
-       if (after(ack, prior_snd_una))
+       if (after(ack, prior_snd_una)) {
                flag |= FLAG_SND_UNA_ADVANCED;
+               icsk->icsk_retransmits = 0;
+       }
 
        prior_fackets = tp->fackets_out;
-       prior_in_flight = tcp_packets_in_flight(tp);
 
        /* ts_recent update must be made after we are sure that the packet
         * is in window.
@@ -3432,7 +3435,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 
                if (TCP_SKB_CB(skb)->sacked)
                        flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
-                                                       &sack_rtt);
+                                                       &sack_rtt_us);
 
                if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb)))
                        flag |= FLAG_ECE;
@@ -3451,12 +3454,13 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 
        /* See if we can take anything off of the retransmit queue. */
        acked = tp->packets_out;
-       flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, sack_rtt);
+       flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una,
+                                   sack_rtt_us);
        acked -= tp->packets_out;
 
        /* Advance cwnd if state allows */
        if (tcp_may_raise_cwnd(sk, flag))
-               tcp_cong_avoid(sk, ack, acked, prior_in_flight);
+               tcp_cong_avoid(sk, ack, acked);
 
        if (tcp_ack_is_dubious(sk, flag)) {
                is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
@@ -3474,8 +3478,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 
        if (icsk->icsk_pending == ICSK_TIME_RETRANS)
                tcp_schedule_loss_probe(sk);
-       if (tp->srtt != prior_rtt || tp->snd_cwnd != prior_cwnd)
-               tcp_update_pacing_rate(sk);
+       tcp_update_pacing_rate(sk);
        return 1;
 
 no_queue:
@@ -3504,7 +3507,7 @@ old_ack:
         */
        if (TCP_SKB_CB(skb)->sacked) {
                flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
-                                               &sack_rtt);
+                                               &sack_rtt_us);
                tcp_fastretrans_alert(sk, acked, prior_unsacked,
                                      is_dupack, flag);
        }
@@ -4418,7 +4421,7 @@ queue_and_out:
                if (eaten > 0)
                        kfree_skb_partial(skb, fragstolen);
                if (!sock_flag(sk, SOCK_DEAD))
-                       sk->sk_data_ready(sk, 0);
+                       sk->sk_data_ready(sk);
                return;
        }
 
@@ -4708,28 +4711,6 @@ static int tcp_prune_queue(struct sock *sk)
        return -1;
 }
 
-/* RFC2861, slow part. Adjust cwnd, after it was not full during one rto.
- * As additional protections, we do not touch cwnd in retransmission phases,
- * and if application hit its sndbuf limit recently.
- */
-void tcp_cwnd_application_limited(struct sock *sk)
-{
-       struct tcp_sock *tp = tcp_sk(sk);
-
-       if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open &&
-           sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
-               /* Limited by application or receiver window. */
-               u32 init_win = tcp_init_cwnd(tp, __sk_dst_get(sk));
-               u32 win_used = max(tp->snd_cwnd_used, init_win);
-               if (win_used < tp->snd_cwnd) {
-                       tp->snd_ssthresh = tcp_current_ssthresh(sk);
-                       tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1;
-               }
-               tp->snd_cwnd_used = 0;
-       }
-       tp->snd_cwnd_stamp = tcp_time_stamp;
-}
-
 static bool tcp_should_expand_sndbuf(const struct sock *sk)
 {
        const struct tcp_sock *tp = tcp_sk(sk);
@@ -4919,7 +4900,7 @@ static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *t
                                BUG();
                        tp->urg_data = TCP_URG_VALID | tmp;
                        if (!sock_flag(sk, SOCK_DEAD))
-                               sk->sk_data_ready(sk, 0);
+                               sk->sk_data_ready(sk);
                }
        }
 }
@@ -5148,19 +5129,15 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
                        }
                } else {
                        int eaten = 0;
-                       int copied_early = 0;
                        bool fragstolen = false;
 
-                       if (tp->copied_seq == tp->rcv_nxt &&
-                           len - tcp_header_len <= tp->ucopy.len) {
-                               if (tp->ucopy.task == current &&
-                                   sock_owned_by_user(sk) && !copied_early) {
-                                       __set_current_state(TASK_RUNNING);
+                       if (tp->ucopy.task == current &&
+                           tp->copied_seq == tp->rcv_nxt &&
+                           len - tcp_header_len <= tp->ucopy.len &&
+                           sock_owned_by_user(sk)) {
+                               __set_current_state(TASK_RUNNING);
 
-                                       if (!tcp_copy_to_iovec(sk, skb, tcp_header_len))
-                                               eaten = 1;
-                               }
-                               if (eaten) {
+                               if (!tcp_copy_to_iovec(sk, skb, tcp_header_len)) {
                                        /* Predicted packet is in window by definition.
                                         * seq == rcv_nxt and rcv_wup <= rcv_nxt.
                                         * Hence, check seq<=rcv_wup reduces to:
@@ -5176,9 +5153,8 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
                                        __skb_pull(skb, tcp_header_len);
                                        tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
                                        NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITSTOUSER);
+                                       eaten = 1;
                                }
-                               if (copied_early)
-                                       tcp_cleanup_rbuf(sk, skb->len);
                        }
                        if (!eaten) {
                                if (tcp_checksum_complete_user(sk, skb))
@@ -5215,12 +5191,11 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
                                        goto no_ack;
                        }
 
-                       if (!copied_early || tp->rcv_nxt != tp->rcv_wup)
-                               __tcp_ack_snd_check(sk, 0);
+                       __tcp_ack_snd_check(sk, 0);
 no_ack:
                        if (eaten)
                                kfree_skb_partial(skb, fragstolen);
-                       sk->sk_data_ready(sk, 0);
+                       sk->sk_data_ready(sk);
                        return;
                }
        }
@@ -5340,9 +5315,12 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
                                break;
                }
                tcp_rearm_rto(sk);
+               NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVEFAIL);
                return true;
        }
        tp->syn_data_acked = tp->syn_data;
+       if (tp->syn_data_acked)
+               NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE);
        return false;
 }
 
@@ -5842,3 +5820,156 @@ discard:
        return 0;
 }
 EXPORT_SYMBOL(tcp_rcv_state_process);
+
+static inline void pr_drop_req(struct request_sock *req, __u16 port, int family)
+{
+       struct inet_request_sock *ireq = inet_rsk(req);
+
+       if (family == AF_INET)
+               LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI4/%u\n"),
+                              &ireq->ir_rmt_addr, port);
+#if IS_ENABLED(CONFIG_IPV6)
+       else if (family == AF_INET6)
+               LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI6/%u\n"),
+                              &ireq->ir_v6_rmt_addr, port);
+#endif
+}
+
+int tcp_conn_request(struct request_sock_ops *rsk_ops,
+                    const struct tcp_request_sock_ops *af_ops,
+                    struct sock *sk, struct sk_buff *skb)
+{
+       struct tcp_options_received tmp_opt;
+       struct request_sock *req;
+       struct tcp_sock *tp = tcp_sk(sk);
+       struct dst_entry *dst = NULL;
+       __u32 isn = TCP_SKB_CB(skb)->when;
+       bool want_cookie = false, fastopen;
+       struct flowi fl;
+       struct tcp_fastopen_cookie foc = { .len = -1 };
+       int err;
+
+
+       /* TW buckets are converted to open requests without
+        * limitations, they conserve resources and peer is
+        * evidently real one.
+        */
+       if ((sysctl_tcp_syncookies == 2 ||
+            inet_csk_reqsk_queue_is_full(sk)) && !isn) {
+               want_cookie = tcp_syn_flood_action(sk, skb, rsk_ops->slab_name);
+               if (!want_cookie)
+                       goto drop;
+       }
+
+
+       /* Accept backlog is full. If we have already queued enough
+        * of warm entries in syn queue, drop request. It is better than
+        * clogging syn queue with openreqs with exponentially increasing
+        * timeout.
+        */
+       if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) {
+               NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
+               goto drop;
+       }
+
+       req = inet_reqsk_alloc(rsk_ops);
+       if (!req)
+               goto drop;
+
+       tcp_rsk(req)->af_specific = af_ops;
+
+       tcp_clear_options(&tmp_opt);
+       tmp_opt.mss_clamp = af_ops->mss_clamp;
+       tmp_opt.user_mss  = tp->rx_opt.user_mss;
+       tcp_parse_options(skb, &tmp_opt, 0, want_cookie ? NULL : &foc);
+
+       if (want_cookie && !tmp_opt.saw_tstamp)
+               tcp_clear_options(&tmp_opt);
+
+       tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
+       tcp_openreq_init(req, &tmp_opt, skb, sk);
+
+       af_ops->init_req(req, sk, skb);
+
+       if (security_inet_conn_request(sk, skb, req))
+               goto drop_and_free;
+
+       if (!want_cookie || tmp_opt.tstamp_ok)
+               TCP_ECN_create_request(req, skb, sock_net(sk));
+
+       if (want_cookie) {
+               isn = cookie_init_sequence(af_ops, sk, skb, &req->mss);
+               req->cookie_ts = tmp_opt.tstamp_ok;
+       } else if (!isn) {
+               /* VJ's idea. We save last timestamp seen
+                * from the destination in peer table, when entering
+                * state TIME-WAIT, and check against it before
+                * accepting new connection request.
+                *
+                * If "isn" is not zero, this request hit alive
+                * timewait bucket, so that all the necessary checks
+                * are made in the function processing timewait state.
+                */
+               if (tcp_death_row.sysctl_tw_recycle) {
+                       bool strict;
+
+                       dst = af_ops->route_req(sk, &fl, req, &strict);
+
+                       if (dst && strict &&
+                           !tcp_peer_is_proven(req, dst, true,
+                                               tmp_opt.saw_tstamp)) {
+                               NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
+                               goto drop_and_release;
+                       }
+               }
+               /* Kill the following clause, if you dislike this way. */
+               else if (!sysctl_tcp_syncookies &&
+                        (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
+                         (sysctl_max_syn_backlog >> 2)) &&
+                        !tcp_peer_is_proven(req, dst, false,
+                                            tmp_opt.saw_tstamp)) {
+                       /* Without syncookies last quarter of
+                        * backlog is filled with destinations,
+                        * proven to be alive.
+                        * It means that we continue to communicate
+                        * to destinations, already remembered
+                        * to the moment of synflood.
+                        */
+                       pr_drop_req(req, ntohs(tcp_hdr(skb)->source),
+                                   rsk_ops->family);
+                       goto drop_and_release;
+               }
+
+               isn = af_ops->init_seq(skb);
+       }
+       if (!dst) {
+               dst = af_ops->route_req(sk, &fl, req, NULL);
+               if (!dst)
+                       goto drop_and_free;
+       }
+
+       tcp_rsk(req)->snt_isn = isn;
+       tcp_openreq_init_rwin(req, sk, dst);
+       fastopen = !want_cookie &&
+                  tcp_try_fastopen(sk, skb, req, &foc, dst);
+       err = af_ops->send_synack(sk, dst, &fl, req,
+                                 skb_get_queue_mapping(skb), &foc);
+       if (!fastopen) {
+               if (err || want_cookie)
+                       goto drop_and_free;
+
+               tcp_rsk(req)->listener = NULL;
+               af_ops->queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
+       }
+
+       return 0;
+
+drop_and_release:
+       dst_release(dst);
+drop_and_free:
+       reqsk_free(req);
+drop:
+       NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
+       return 0;
+}
+EXPORT_SYMBOL(tcp_conn_request);
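
One heuristic worth calling out from the newly consolidated tcp_conn_request(): when syncookies are disabled, the last quarter of the SYN backlog is reserved for destinations already proven to be alive. Isolated, and with a hypothetical helper name, the check is simply:

#include <stdbool.h>

/* True when fewer than a quarter of max_syn_backlog slots remain, i.e.
 * when, without syncookies, only peers already proven alive should still
 * be admitted (see the comment block inside tcp_conn_request() above). */
bool syn_backlog_nearly_full(int max_syn_backlog, int queue_len)
{
        return max_syn_backlog - queue_len < (max_syn_backlog >> 2);
}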