while ((delta -= inet_csk(sk)->icsk_rto) > 0 && cwnd > restart_cwnd)
cwnd >>= 1;
tp->snd_cwnd = max(cwnd, restart_cwnd);
- tp->snd_cwnd_stamp = tcp_time_stamp;
+ tp->snd_cwnd_stamp = tcp_jiffies32;
tp->snd_cwnd_used = 0;
}
struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
- const u32 now = tcp_time_stamp;
+ const u32 now = tcp_jiffies32;
if (tcp_packets_in_flight(tp) == 0)
tcp_ca_event(sk, CA_EVENT_TX_START);
sk_free(sk);
}
+/* Note: Called under hard irq.
+ * We can not call TCP stack right away.
+ */
+enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer)
+{
+ struct tcp_sock *tp = container_of(timer, struct tcp_sock, pacing_timer);
+ struct sock *sk = (struct sock *)tp;
+ unsigned long nval, oval;
+
+ for (oval = READ_ONCE(sk->sk_tsq_flags);; oval = nval) {
+ struct tsq_tasklet *tsq;
+ bool empty;
+
+ if (oval & TSQF_QUEUED)
+ break;
+
+ nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED | TCPF_TSQ_DEFERRED;
+ nval = cmpxchg(&sk->sk_tsq_flags, oval, nval);
+ if (nval != oval)
+ continue;
+
+ if (!atomic_inc_not_zero(&sk->sk_wmem_alloc))
+ break;
+ /* queue this socket to tasklet queue */
+ tsq = this_cpu_ptr(&tsq_tasklet);
+ empty = list_empty(&tsq->head);
+ list_add(&tp->tsq_node, &tsq->head);
+ if (empty)
+ tasklet_schedule(&tsq->tasklet);
+ break;
+ }
+ return HRTIMER_NORESTART;
+}
+
+/* BBR congestion control needs pacing.
+ * Same remark for SO_MAX_PACING_RATE.
+ * sch_fq packet scheduler is efficiently handling pacing,
+ * but is not always installed/used.
+ * Return true if TCP stack should pace packets itself.
+ */
+static bool tcp_needs_internal_pacing(const struct sock *sk)
+{
+ return smp_load_acquire(&sk->sk_pacing_status) == SK_PACING_NEEDED;
+}
+
+static void tcp_internal_pacing(struct sock *sk, const struct sk_buff *skb)
+{
+ u64 len_ns;
+ u32 rate;
+
+ if (!tcp_needs_internal_pacing(sk))
+ return;
+ rate = sk->sk_pacing_rate;
+ if (!rate || rate == ~0U)
+ return;
+
+ /* Should account for header sizes as sch_fq does,
+ * but lets make things simple.
+ */
+ len_ns = (u64)skb->len * NSEC_PER_SEC;
+ do_div(len_ns, rate);
+ hrtimer_start(&tcp_sk(sk)->pacing_timer,
+ ktime_add_ns(ktime_get(), len_ns),
+ HRTIMER_MODE_ABS_PINNED);
+}
+
/* This routine actually transmits TCP packets queued in by
* tcp_do_sendmsg(). This is used by both the initial
* transmission and possible later retransmissions.
BUG_ON(!skb || !tcp_skb_pcount(skb));
tp = tcp_sk(sk);
+ skb->skb_mstamp = tp->tcp_mstamp;
if (clone_it) {
- skb_mstamp_get(&skb->skb_mstamp);
TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq
- tp->snd_una;
tcp_rate_skb_sent(sk, skb);
if (skb->len != tcp_header_size) {
tcp_event_data_sent(tp, sk);
tp->data_segs_out += tcp_skb_pcount(skb);
+ tcp_internal_pacing(sk, skb);
}
if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq)
return 0;
}
-/* This is similar to __pskb_pull_head() (it will go to core/skbuff.c
- * eventually). The difference is that pulled data not copied, but
- * immediately discarded.
+/* This is similar to __pskb_pull_tail(). The difference is that pulled
+ * data is not copied, but immediately discarded.
*/
static int __pskb_trim_head(struct sk_buff *skb, int len)
{
}
shinfo->nr_frags = k;
- skb_reset_tail_pointer(skb);
skb->data_len -= len;
skb->len = skb->data_len;
return len;
icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, net->ipv4.sysctl_tcp_base_mss);
icsk->icsk_mtup.probe_size = 0;
if (icsk->icsk_mtup.enabled)
- icsk->icsk_mtup.probe_timestamp = tcp_time_stamp;
+ icsk->icsk_mtup.probe_timestamp = tcp_jiffies32;
}
EXPORT_SYMBOL(tcp_mtup_init);
}
tp->snd_cwnd_used = 0;
}
- tp->snd_cwnd_stamp = tcp_time_stamp;
+ tp->snd_cwnd_stamp = tcp_jiffies32;
}
static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited)
if (tcp_is_cwnd_limited(sk)) {
/* Network is feed fully. */
tp->snd_cwnd_used = 0;
- tp->snd_cwnd_stamp = tcp_time_stamp;
+ tp->snd_cwnd_stamp = tcp_jiffies32;
} else {
/* Network starves. */
if (tp->packets_out > tp->snd_cwnd_used)
tp->snd_cwnd_used = tp->packets_out;
if (sysctl_tcp_slow_start_after_idle &&
- (s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto &&
+ (s32)(tcp_jiffies32 - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto &&
!ca_ops->cong_control)
tcp_cwnd_application_limited(sk);
const struct inet_connection_sock *icsk = inet_csk(sk);
u32 age, send_win, cong_win, limit, in_flight;
struct tcp_sock *tp = tcp_sk(sk);
- struct skb_mstamp now;
struct sk_buff *head;
int win_divisor;
/* Avoid bursty behavior by allowing defer
* only if the last write was recent.
*/
- if ((s32)(tcp_time_stamp - tp->lsndtime) > 0)
+ if ((s32)(tcp_jiffies32 - tp->lsndtime) > 0)
goto send_now;
in_flight = tcp_packets_in_flight(tp);
}
head = tcp_write_queue_head(sk);
- skb_mstamp_get(&now);
- age = skb_mstamp_us_delta(&now, &head->skb_mstamp);
+
+ age = tcp_stamp_us_delta(tp->tcp_mstamp, head->skb_mstamp);
/* If next ACK is likely to come too late (half srtt), do not defer */
if (age < (tp->srtt_us >> 4))
goto send_now;
s32 delta;
interval = net->ipv4.sysctl_tcp_probe_interval;
- delta = tcp_time_stamp - icsk->icsk_mtup.probe_timestamp;
+ delta = tcp_jiffies32 - icsk->icsk_mtup.probe_timestamp;
if (unlikely(delta >= interval * HZ)) {
int mss = tcp_current_mss(sk);
icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss);
/* Update probe time stamp */
- icsk->icsk_mtup.probe_timestamp = tcp_time_stamp;
+ icsk->icsk_mtup.probe_timestamp = tcp_jiffies32;
}
}
return -1;
}
+static bool tcp_pacing_check(const struct sock *sk)
+{
+ return tcp_needs_internal_pacing(sk) &&
+ hrtimer_active(&tcp_sk(sk)->pacing_timer);
+}
+
/* TCP Small Queues :
* Control number of packets in qdisc/devices to two packets / or ~1 ms.
* (These limits are doubled for retransmits)
static void tcp_chrono_set(struct tcp_sock *tp, const enum tcp_chrono new)
{
- const u32 now = tcp_time_stamp;
+ const u32 now = tcp_jiffies32;
if (tp->chrono_type > TCP_CHRONO_UNSPEC)
tp->chrono_stat[tp->chrono_type - 1] += now - tp->chrono_start;
}
max_segs = tcp_tso_segs(sk, mss_now);
+ tcp_mstamp_refresh(tp);
while ((skb = tcp_send_head(sk))) {
unsigned int limit;
+ if (tcp_pacing_check(sk))
+ break;
+
tso_segs = tcp_init_tso_segs(skb, mss_now);
BUG_ON(!tso_segs);
if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) {
/* "skb_mstamp" is used as a start point for the retransmit timer */
- skb_mstamp_get(&skb->skb_mstamp);
+ skb->skb_mstamp = tp->tcp_mstamp;
goto repair; /* Skip network transmission */
}
timeout = max_t(u32, timeout, msecs_to_jiffies(10));
/* If RTO is shorter, just schedule TLP in its place. */
- tlp_time_stamp = tcp_time_stamp + timeout;
+ tlp_time_stamp = tcp_jiffies32 + timeout;
rto_time_stamp = (u32)inet_csk(sk)->icsk_timeout;
if ((s32)(tlp_time_stamp - rto_time_stamp) > 0) {
- s32 delta = rto_time_stamp - tcp_time_stamp;
+ s32 delta = rto_time_stamp - tcp_jiffies32;
if (delta > 0)
timeout = delta;
}
skb_headroom(skb) >= 0xFFFF)) {
struct sk_buff *nskb;
- skb_mstamp_get(&skb->skb_mstamp);
+ skb->skb_mstamp = tp->tcp_mstamp;
nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC);
err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) :
-ENOBUFS;
if (skb == tcp_send_head(sk))
break;
+
+ if (tcp_pacing_check(sk))
+ break;
+
/* we could do better than to assign each time */
if (!hole)
tp->retransmit_skb_hint = skb;
skb_reserve(skb, MAX_TCP_HEADER);
tcp_init_nondata_skb(skb, tcp_acceptable_seq(sk),
TCPHDR_ACK | TCPHDR_RST);
- skb_mstamp_get(&skb->skb_mstamp);
+ tcp_mstamp_refresh(tcp_sk(sk));
/* Send it off. */
if (tcp_transmit_skb(sk, skb, 0, priority))
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED);
memset(&opts, 0, sizeof(opts));
#ifdef CONFIG_SYN_COOKIES
if (unlikely(req->cookie_ts))
- skb->skb_mstamp.stamp_jiffies = cookie_init_timestamp(req);
+ skb->skb_mstamp = cookie_init_timestamp(req);
else
#endif
- skb_mstamp_get(&skb->skb_mstamp);
+ skb->skb_mstamp = tcp_clock_us();
#ifdef CONFIG_TCP_MD5SIG
rcu_read_lock();
if (likely(!tp->repair))
tp->rcv_nxt = 0;
else
- tp->rcv_tstamp = tcp_time_stamp;
+ tp->rcv_tstamp = tcp_jiffies32;
tp->rcv_wup = tp->rcv_nxt;
tp->copied_seq = tp->rcv_nxt;
return -ENOBUFS;
tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN);
- tp->retrans_stamp = tcp_time_stamp;
+ tcp_mstamp_refresh(tp);
+ tp->retrans_stamp = tcp_time_stamp(tp);
tcp_connect_queue_skb(sk, buff);
tcp_ecn_send_syn(sk, buff);
skb_set_tcp_pure_ack(buff);
/* Send it off, this clears delayed acks for us. */
- skb_mstamp_get(&buff->skb_mstamp);
tcp_transmit_skb(sk, buff, 0, (__force gfp_t)0);
}
EXPORT_SYMBOL_GPL(tcp_send_ack);
* send it.
*/
tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK);
- skb_mstamp_get(&skb->skb_mstamp);
NET_INC_STATS(sock_net(sk), mib);
return tcp_transmit_skb(sk, skb, 0, (__force gfp_t)0);
}
+/* Called from setsockopt( ... TCP_REPAIR ) */
void tcp_send_window_probe(struct sock *sk)
{
if (sk->sk_state == TCP_ESTABLISHED) {
tcp_sk(sk)->snd_wl1 = tcp_sk(sk)->rcv_nxt - 1;
+ tcp_mstamp_refresh(tcp_sk(sk));
tcp_xmit_probe_skb(sk, 0, LINUX_MIB_TCPWINPROBE);
}
}