forked from CachyOS/kernel-patches
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path0001-bbr2.patch
3285 lines (3241 loc) · 120 KB
/
0001-bbr2.patch
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
From c97835d93d4ad192b2504e6ff1106410c02ca2e0 Mon Sep 17 00:00:00 2001
From: Peter Jung <admin@ptr1337.dev>
Date: Sun, 15 Jan 2023 16:50:23 +0100
Subject: [PATCH 01/16] bbr2
Signed-off-by: Peter Jung <admin@ptr1337.dev>
---
include/linux/tcp.h | 3 +-
include/net/inet_connection_sock.h | 3 +-
include/net/tcp.h | 41 +-
include/uapi/linux/inet_diag.h | 33 +
net/ipv4/Kconfig | 22 +
net/ipv4/Makefile | 1 +
net/ipv4/tcp.c | 1 +
net/ipv4/tcp_bbr.c | 38 +-
net/ipv4/tcp_bbr2.c | 2674 ++++++++++++++++++++++++++++
net/ipv4/tcp_cong.c | 1 +
net/ipv4/tcp_input.c | 27 +-
net/ipv4/tcp_output.c | 26 +-
net/ipv4/tcp_rate.c | 30 +-
net/ipv4/tcp_timer.c | 1 +
14 files changed, 2867 insertions(+), 34 deletions(-)
create mode 100644 net/ipv4/tcp_bbr2.c
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 41b1da621a45..d8f94ef1a297 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -255,7 +255,8 @@ struct tcp_sock {
u8 compressed_ack;
u8 dup_ack_counter:2,
tlp_retrans:1, /* TLP is a retransmission */
- unused:5;
+ fast_ack_mode:2, /* which fast ack mode ? */
+ unused:3;
u32 chrono_start; /* Start time in jiffies of a TCP chrono */
u32 chrono_stat[3]; /* Time in jiffies for chrono_stat stats */
u8 chrono_type:2, /* current chronograph type */
diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index c2b15f7e5516..d85858efa571 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -135,7 +135,8 @@ struct inet_connection_sock {
u32 icsk_probes_tstamp;
u32 icsk_user_timeout;
- u64 icsk_ca_priv[104 / sizeof(u64)];
+/* XXX inflated by temporary internal debugging info */
+ u64 icsk_ca_priv[216 / sizeof(u64)];
#define ICSK_CA_PRIV_SIZE sizeof_field(struct inet_connection_sock, icsk_ca_priv)
};
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 5b70b241ce71..4ae0f55cf0e1 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -375,6 +375,7 @@ static inline void tcp_dec_quickack_mode(struct sock *sk,
#define TCP_ECN_QUEUE_CWR 2
#define TCP_ECN_DEMAND_CWR 4
#define TCP_ECN_SEEN 8
+#define TCP_ECN_ECT_PERMANENT 16
enum tcp_tw_status {
TCP_TW_SUCCESS = 0,
@@ -823,6 +824,11 @@ static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0)
return max_t(s64, t1 - t0, 0);
}
+static inline u32 tcp_stamp32_us_delta(u32 t1, u32 t0)
+{
+ return max_t(s32, t1 - t0, 0);
+}
+
static inline u32 tcp_skb_timestamp(const struct sk_buff *skb)
{
return tcp_ns_to_ts(skb->skb_mstamp_ns);
@@ -898,9 +904,14 @@ struct tcp_skb_cb {
/* pkts S/ACKed so far upon tx of skb, incl retrans: */
__u32 delivered;
/* start of send pipeline phase */
- u64 first_tx_mstamp;
+ u32 first_tx_mstamp;
/* when we reached the "delivered" count */
- u64 delivered_mstamp;
+ u32 delivered_mstamp;
+#define TCPCB_IN_FLIGHT_BITS 20
+#define TCPCB_IN_FLIGHT_MAX ((1U << TCPCB_IN_FLIGHT_BITS) - 1)
+ u32 in_flight:20, /* packets in flight at transmit */
+ unused2:12;
+ u32 lost; /* packets lost so far upon tx of skb */
} tx; /* only used for outgoing skbs */
union {
struct inet_skb_parm h4;
@@ -1026,7 +1037,11 @@ enum tcp_ca_ack_event_flags {
#define TCP_CONG_NON_RESTRICTED 0x1
/* Requires ECN/ECT set on all packets */
#define TCP_CONG_NEEDS_ECN 0x2
-#define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | TCP_CONG_NEEDS_ECN)
+/* Wants notification of CE events (CA_EVENT_ECN_IS_CE, CA_EVENT_ECN_NO_CE). */
+#define TCP_CONG_WANTS_CE_EVENTS 0x4
+#define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | \
+ TCP_CONG_NEEDS_ECN | \
+ TCP_CONG_WANTS_CE_EVENTS)
union tcp_cc_info;
@@ -1046,8 +1061,11 @@ struct ack_sample {
*/
struct rate_sample {
u64 prior_mstamp; /* starting timestamp for interval */
+ u32 prior_lost; /* tp->lost at "prior_mstamp" */
u32 prior_delivered; /* tp->delivered at "prior_mstamp" */
u32 prior_delivered_ce;/* tp->delivered_ce at "prior_mstamp" */
+ u32 tx_in_flight; /* packets in flight at starting timestamp */
+ s32 lost; /* number of packets lost over interval */
s32 delivered; /* number of packets delivered over interval */
s32 delivered_ce; /* number of packets delivered w/ CE marks*/
long interval_us; /* time for tp->delivered to incr "delivered" */
@@ -1061,6 +1079,7 @@ struct rate_sample {
bool is_app_limited; /* is sample from packet with bubble in pipe? */
bool is_retrans; /* is sample from retransmission? */
bool is_ack_delayed; /* is this (likely) a delayed ACK? */
+ bool is_ece; /* did this ACK have ECN marked? */
};
struct tcp_congestion_ops {
@@ -1084,8 +1103,11 @@ struct tcp_congestion_ops {
/* hook for packet ack accounting (optional) */
void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample);
- /* override sysctl_tcp_min_tso_segs */
- u32 (*min_tso_segs)(struct sock *sk);
+ /* pick target number of segments per TSO/GSO skb (optional): */
+ u32 (*tso_segs)(struct sock *sk, unsigned int mss_now);
+
+ /* react to a specific lost skb (optional) */
+ void (*skb_marked_lost)(struct sock *sk, const struct sk_buff *skb);
/* call when packets are delivered to update cwnd and pacing rate,
* after all the ca_state processing. (optional)
@@ -1148,6 +1170,14 @@ static inline char *tcp_ca_get_name_by_key(u32 key, char *buffer)
}
#endif
+static inline bool tcp_ca_wants_ce_events(const struct sock *sk)
+{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+
+ return icsk->icsk_ca_ops->flags & (TCP_CONG_NEEDS_ECN |
+ TCP_CONG_WANTS_CE_EVENTS);
+}
+
static inline bool tcp_ca_needs_ecn(const struct sock *sk)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -1167,6 +1197,7 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event)
void tcp_set_ca_state(struct sock *sk, const u8 ca_state);
/* From tcp_rate.c */
+void tcp_set_tx_in_flight(struct sock *sk, struct sk_buff *skb);
void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb);
void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb,
struct rate_sample *rs);
diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h
index 50655de04c9b..0e24f11627d5 100644
--- a/include/uapi/linux/inet_diag.h
+++ b/include/uapi/linux/inet_diag.h
@@ -231,9 +231,42 @@ struct tcp_bbr_info {
__u32 bbr_cwnd_gain; /* cwnd gain shifted left 8 bits */
};
+/* Phase as reported in netlink/ss stats. */
+enum tcp_bbr2_phase {
+ BBR2_PHASE_INVALID = 0,
+ BBR2_PHASE_STARTUP = 1,
+ BBR2_PHASE_DRAIN = 2,
+ BBR2_PHASE_PROBE_RTT = 3,
+ BBR2_PHASE_PROBE_BW_UP = 4,
+ BBR2_PHASE_PROBE_BW_DOWN = 5,
+ BBR2_PHASE_PROBE_BW_CRUISE = 6,
+ BBR2_PHASE_PROBE_BW_REFILL = 7
+};
+
+struct tcp_bbr2_info {
+ /* u64 bw: bandwidth (app throughput) estimate in Byte per sec: */
+ __u32 bbr_bw_lsb; /* lower 32 bits of bw */
+ __u32 bbr_bw_msb; /* upper 32 bits of bw */
+ __u32 bbr_min_rtt; /* min-filtered RTT in uSec */
+ __u32 bbr_pacing_gain; /* pacing gain shifted left 8 bits */
+ __u32 bbr_cwnd_gain; /* cwnd gain shifted left 8 bits */
+ __u32 bbr_bw_hi_lsb; /* lower 32 bits of bw_hi */
+ __u32 bbr_bw_hi_msb; /* upper 32 bits of bw_hi */
+ __u32 bbr_bw_lo_lsb; /* lower 32 bits of bw_lo */
+ __u32 bbr_bw_lo_msb; /* upper 32 bits of bw_lo */
+ __u8 bbr_mode; /* current bbr_mode in state machine */
+ __u8 bbr_phase; /* current state machine phase */
+ __u8 unused1; /* alignment padding; not used yet */
+ __u8 bbr_version; /* MUST be at this offset in struct */
+ __u32 bbr_inflight_lo; /* lower/short-term data volume bound */
+ __u32 bbr_inflight_hi; /* higher/long-term data volume bound */
+ __u32 bbr_extra_acked; /* max excess packets ACKed in epoch */
+};
+
union tcp_cc_info {
struct tcpvegas_info vegas;
struct tcp_dctcp_info dctcp;
struct tcp_bbr_info bbr;
+ struct tcp_bbr2_info bbr2;
};
#endif /* _UAPI_INET_DIAG_H_ */
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 2dfb12230f08..b6bec331a82e 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -678,6 +678,24 @@ config TCP_CONG_BBR
AQM schemes that do not provide a delay signal. It requires the fq
("Fair Queue") pacing packet scheduler.
+config TCP_CONG_BBR2
+ tristate "BBR2 TCP"
+ default n
+ help
+
+ BBR2 TCP congestion control is a model-based congestion control
+ algorithm that aims to maximize network utilization, keep queues and
+ retransmit rates low, and to be able to coexist with Reno/CUBIC in
+ common scenarios. It builds an explicit model of the network path. It
+ tolerates a targeted degree of random packet loss and delay that are
+ unrelated to congestion. It can operate over LAN, WAN, cellular, wifi,
+ or cable modem links, and can use DCTCP-L4S-style ECN signals. It can
+ coexist with flows that use loss-based congestion control, and can
+ operate with shallow buffers, deep buffers, bufferbloat, policers, or
+ AQM schemes that do not provide a delay signal. It requires pacing,
+ using either TCP internal pacing or the fq ("Fair Queue") pacing packet
+ scheduler.
+
choice
prompt "Default TCP congestion control"
default DEFAULT_CUBIC
@@ -715,6 +733,9 @@ choice
config DEFAULT_BBR
bool "BBR" if TCP_CONG_BBR=y
+ config DEFAULT_BBR2
+ bool "BBR2" if TCP_CONG_BBR2=y
+
config DEFAULT_RENO
bool "Reno"
endchoice
@@ -739,6 +760,7 @@ config DEFAULT_TCP_CONG
default "dctcp" if DEFAULT_DCTCP
default "cdg" if DEFAULT_CDG
default "bbr" if DEFAULT_BBR
+ default "bbr2" if DEFAULT_BBR2
default "cubic"
config TCP_MD5SIG
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index bbdd9c44f14e..8dee1547d820 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -46,6 +46,7 @@ obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o
obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o
obj-$(CONFIG_INET_RAW_DIAG) += raw_diag.o
obj-$(CONFIG_TCP_CONG_BBR) += tcp_bbr.o
+obj-$(CONFIG_TCP_CONG_BBR2) += tcp_bbr2.o
obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
obj-$(CONFIG_TCP_CONG_CDG) += tcp_cdg.o
obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 6667c3538f2a..34207ec358d8 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3190,6 +3190,7 @@ int tcp_disconnect(struct sock *sk, int flags)
tp->rx_opt.dsack = 0;
tp->rx_opt.num_sacks = 0;
tp->rcv_ooopack = 0;
+ tp->fast_ack_mode = 0;
/* Clean up fastopen related fields */
diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
index 54eec33c6e1c..bfbf158c71f4 100644
--- a/net/ipv4/tcp_bbr.c
+++ b/net/ipv4/tcp_bbr.c
@@ -294,26 +294,40 @@ static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain)
sk->sk_pacing_rate = rate;
}
-/* override sysctl_tcp_min_tso_segs */
static u32 bbr_min_tso_segs(struct sock *sk)
{
return sk->sk_pacing_rate < (bbr_min_tso_rate >> 3) ? 1 : 2;
}
+/* Return the number of segments BBR would like in a TSO/GSO skb, given
+ * a particular max gso size as a constraint.
+ */
+static u32 bbr_tso_segs_generic(struct sock *sk, unsigned int mss_now,
+ u32 gso_max_size)
+{
+ u32 segs;
+ u64 bytes;
+
+ /* Budget a TSO/GSO burst size allowance based on bw (pacing_rate). */
+ bytes = sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift);
+ /* Clamp in 64 bits: a u32 cast could wrap a huge allowance to ~0. */
+ bytes = min_t(u64, bytes, gso_max_size - 1 - MAX_TCP_HEADER);
+ segs = max_t(u32, div_u64(bytes, mss_now), bbr_min_tso_segs(sk));
+ return segs;
+}
+
+/* Custom tcp_tso_autosize() for BBR, used at transmit time to cap skb size. */
+static u32 bbr_tso_segs(struct sock *sk, unsigned int mss_now)
+{
+ return bbr_tso_segs_generic(sk, mss_now, sk->sk_gso_max_size);
+}
+
+/* Like bbr_tso_segs(), using mss_cache, ignoring driver's sk_gso_max_size. */
static u32 bbr_tso_segs_goal(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
- u32 segs, bytes;
-
- /* Sort of tcp_tso_autosize() but ignoring
- * driver provided sk_gso_max_size.
- */
- bytes = min_t(unsigned long,
- sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift),
- GSO_LEGACY_MAX_SIZE - 1 - MAX_TCP_HEADER);
- segs = max_t(u32, bytes / tp->mss_cache, bbr_min_tso_segs(sk));
- return min(segs, 0x7FU);
+ return bbr_tso_segs_generic(sk, tp->mss_cache, GSO_LEGACY_MAX_SIZE);
}
/* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */
@@ -1149,7 +1163,7 @@ static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = {
.undo_cwnd = bbr_undo_cwnd,
.cwnd_event = bbr_cwnd_event,
.ssthresh = bbr_ssthresh,
- .min_tso_segs = bbr_min_tso_segs,
+ .tso_segs = bbr_tso_segs,
.get_info = bbr_get_info,
.set_state = bbr_set_state,
};
diff --git a/net/ipv4/tcp_bbr2.c b/net/ipv4/tcp_bbr2.c
new file mode 100644
index 000000000000..85f8052144d1
--- /dev/null
+++ b/net/ipv4/tcp_bbr2.c
@@ -0,0 +1,2674 @@
+/* BBR (Bottleneck Bandwidth and RTT) congestion control, v2
+ *
+ * BBRv2 is a model-based congestion control algorithm that aims for low
+ * queues, low loss, and (bounded) Reno/CUBIC coexistence. To maintain a model
+ * of the network path, it uses measurements of bandwidth and RTT, as well as
+ * (if they occur) packet loss and/or DCTCP/L4S-style ECN signals. Note that
+ * although it can use ECN or loss signals explicitly, it does not require
+ * either; it can bound its in-flight data based on its estimate of the BDP.
+ *
+ * The model has both higher and lower bounds for the operating range:
+ * lo: bw_lo, inflight_lo: conservative short-term lower bound
+ * hi: bw_hi, inflight_hi: robust long-term upper bound
+ * The bandwidth-probing time scale is (a) extended dynamically based on
+ * estimated BDP to improve coexistence with Reno/CUBIC; (b) bounded by
+ * an interactive wall-clock time-scale to be more scalable and responsive
+ * than Reno and CUBIC.
+ *
+ * Here is a state transition diagram for BBR:
+ *
+ * |
+ * V
+ * +---> STARTUP ----+
+ * | | |
+ * | V |
+ * | DRAIN ----+
+ * | | |
+ * | V |
+ * +---> PROBE_BW ----+
+ * | ^ | |
+ * | | | |
+ * | +----+ |
+ * | |
+ * +---- PROBE_RTT <--+
+ *
+ * A BBR flow starts in STARTUP, and ramps up its sending rate quickly.
+ * When it estimates the pipe is full, it enters DRAIN to drain the queue.
+ * In steady state a BBR flow only uses PROBE_BW and PROBE_RTT.
+ * A long-lived BBR flow spends the vast majority of its time remaining
+ * (repeatedly) in PROBE_BW, fully probing and utilizing the pipe's bandwidth
+ * in a fair manner, with a small, bounded queue. *If* a flow has been
+ * continuously sending for the entire probe_rtt_win_ms window (5 seconds by
+ * default) without an RTT sample that matches or lowers probe_rtt_min_us, then
+ * it briefly enters PROBE_RTT to cut inflight to a minimum value to re-probe
+ * the path's two-way propagation delay (min_rtt). When exiting PROBE_RTT, if
+ * we estimated that we reached the full bw of the pipe then we enter PROBE_BW;
+ * otherwise we enter STARTUP to try to fill the pipe.
+ *
+ * BBR is described in detail in:
+ * "BBR: Congestion-Based Congestion Control",
+ * Neal Cardwell, Yuchung Cheng, C. Stephen Gunn, Soheil Hassas Yeganeh,
+ * Van Jacobson. ACM Queue, Vol. 14 No. 5, September-October 2016.
+ *
+ * There is a public e-mail list for discussing BBR development and testing:
+ * https://groups.google.com/forum/#!forum/bbr-dev
+ *
+ * NOTE: BBR might be used with the fq qdisc ("man tc-fq") with pacing enabled,
+ * otherwise TCP stack falls back to an internal pacing using one high
+ * resolution timer per TCP socket and may use more resources.
+ */
+#include <linux/module.h>
+#include <net/tcp.h>
+#include <linux/inet_diag.h>
+#include <linux/inet.h>
+#include <linux/random.h>
+
+#include "tcp_dctcp.h"
+
+/* Scale factor for rate in pkt/uSec unit to avoid truncation in bandwidth
+ * estimation. The rate unit ~= (1500 bytes / 1 usec / 2^24) ~= 715 bps.
+ * This handles bandwidths from 0.06pps (715bps) to 256Mpps (3Tbps) in a u32.
+ * Since the minimum window is >=4 packets, the lower bound isn't
+ * an issue. The upper bound isn't an issue with existing technologies.
+ */
+#define BW_SCALE 24
+#define BW_UNIT (1 << BW_SCALE)
+
+#define BBR_SCALE 8 /* scaling factor for fractions in BBR (e.g. gains) */
+#define BBR_UNIT (1 << BBR_SCALE)
+
+#define FLAG_DEBUG_VERBOSE 0x1 /* Verbose debugging messages */
+#define FLAG_DEBUG_LOOPBACK 0x2 /* Do NOT skip loopback addr */
+
+#define CYCLE_LEN 8 /* number of phases in a pacing gain cycle */
+
+/* BBR has the following modes for deciding how fast to send: */
+enum bbr_mode {
+ BBR_STARTUP, /* ramp up sending rate rapidly to fill pipe */
+ BBR_DRAIN, /* drain any queue created during startup */
+ BBR_PROBE_BW, /* discover, share bw: pace around estimated bw */
+ BBR_PROBE_RTT, /* cut inflight to min to probe min_rtt */
+};
+
+/* How does the incoming ACK stream relate to our bandwidth probing? */
+enum bbr_ack_phase {
+ BBR_ACKS_INIT, /* not probing; not getting probe feedback */
+ BBR_ACKS_REFILLING, /* sending at est. bw to fill pipe */
+ BBR_ACKS_PROBE_STARTING, /* inflight rising to probe bw */
+ BBR_ACKS_PROBE_FEEDBACK, /* getting feedback from bw probing */
+ BBR_ACKS_PROBE_STOPPING, /* stopped probing; still getting feedback */
+};
+
+/* BBR congestion control block */
+struct bbr {
+ u32 min_rtt_us; /* min RTT in min_rtt_win_sec window */
+ u32 min_rtt_stamp; /* timestamp of min_rtt_us */
+ u32 probe_rtt_done_stamp; /* end time for BBR_PROBE_RTT mode */
+ u32 probe_rtt_min_us; /* min RTT in bbr_probe_rtt_win_ms window */
+ u32 probe_rtt_min_stamp; /* timestamp of probe_rtt_min_us*/
+ u32 next_rtt_delivered; /* scb->tx.delivered at end of round */
+ u32 prior_rcv_nxt; /* tp->rcv_nxt when CE state last changed */
+ u64 cycle_mstamp; /* time of this cycle phase start */
+ u32 mode:3, /* current bbr_mode in state machine */
+ prev_ca_state:3, /* CA state on previous ACK */
+ packet_conservation:1, /* use packet conservation? */
+ round_start:1, /* start of packet-timed tx->ack round? */
+ ce_state:1, /* If most recent data has CE bit set */
+ bw_probe_up_rounds:5, /* cwnd-limited rounds in PROBE_UP */
+ try_fast_path:1, /* can we take fast path? */
+ unused2:11,
+ idle_restart:1, /* restarting after idle? */
+ probe_rtt_round_done:1, /* a BBR_PROBE_RTT round at 4 pkts? */
+ cycle_idx:3, /* current index in pacing_gain cycle array */
+ has_seen_rtt:1; /* have we seen an RTT sample yet? */
+ u32 pacing_gain:11, /* current gain for setting pacing rate */
+ cwnd_gain:11, /* current gain for setting cwnd */
+ full_bw_reached:1, /* reached full bw in Startup? */
+ full_bw_cnt:2, /* number of rounds without large bw gains */
+ init_cwnd:7; /* initial cwnd */
+ u32 prior_cwnd; /* prior cwnd upon entering loss recovery */
+ u32 full_bw; /* recent bw, to estimate if pipe is full */
+
+ /* For tracking ACK aggregation: */
+ u64 ack_epoch_mstamp; /* start of ACK sampling epoch */
+ u16 extra_acked[2]; /* max excess data ACKed in epoch */
+ u32 ack_epoch_acked:20, /* packets (S)ACKed in sampling epoch */
+ extra_acked_win_rtts:5, /* age of extra_acked, in round trips */
+ extra_acked_win_idx:1, /* current index in extra_acked array */
+ /* BBR v2 state: */
+ unused1:2,
+ startup_ecn_rounds:2, /* consecutive hi ECN STARTUP rounds */
+ loss_in_cycle:1, /* packet loss in this cycle? */
+ ecn_in_cycle:1; /* ECN in this cycle? */
+ u32 loss_round_delivered; /* scb->tx.delivered ending loss round */
+ u32 undo_bw_lo; /* bw_lo before latest losses */
+ u32 undo_inflight_lo; /* inflight_lo before latest losses */
+ u32 undo_inflight_hi; /* inflight_hi before latest losses */
+ u32 bw_latest; /* max delivered bw in last round trip */
+ u32 bw_lo; /* lower bound on sending bandwidth */
+ u32 bw_hi[2]; /* upper bound of sending bandwidth range*/
+ u32 inflight_latest; /* max delivered data in last round trip */
+ u32 inflight_lo; /* lower bound of inflight data range */
+ u32 inflight_hi; /* upper bound of inflight data range */
+ u32 bw_probe_up_cnt; /* packets delivered per inflight_hi incr */
+ u32 bw_probe_up_acks; /* packets (S)ACKed since inflight_hi incr */
+ u32 probe_wait_us; /* PROBE_DOWN until next clock-driven probe */
+ u32 ecn_eligible:1, /* sender can use ECN (RTT, handshake)? */
+ ecn_alpha:9, /* EWMA delivered_ce/delivered; 0..256 */
+ bw_probe_samples:1, /* rate samples reflect bw probing? */
+ prev_probe_too_high:1, /* did last PROBE_UP go too high? */
+ stopped_risky_probe:1, /* last PROBE_UP stopped due to risk? */
+ rounds_since_probe:8, /* packet-timed rounds since probed bw */
+ loss_round_start:1, /* loss_round_delivered round trip? */
+ loss_in_round:1, /* loss marked in this round trip? */
+ ecn_in_round:1, /* ECN marked in this round trip? */
+ ack_phase:3, /* bbr_ack_phase: meaning of ACKs */
+ loss_events_in_round:4,/* losses in STARTUP round */
+ initialized:1; /* has bbr_init() been called? */
+ u32 alpha_last_delivered; /* tp->delivered at alpha update */
+ u32 alpha_last_delivered_ce; /* tp->delivered_ce at alpha update */
+
+ /* Params configurable using setsockopt. Refer to corresponding
+ * module param for detailed description of params.
+ */
+ struct bbr_params {
+ u32 high_gain:11, /* max allowed value: 2047 */
+ drain_gain:10, /* max allowed value: 1023 */
+ cwnd_gain:11; /* max allowed value: 2047 */
+ u32 cwnd_min_target:4, /* max allowed value: 15 */
+ min_rtt_win_sec:5, /* max allowed value: 31 */
+ probe_rtt_mode_ms:9, /* max allowed value: 511 */
+ full_bw_cnt:3, /* max allowed value: 7 */
+ cwnd_tso_budget:1, /* allowed values: {0, 1} */
+ unused3:6,
+ drain_to_target:1, /* boolean */
+ precise_ece_ack:1, /* boolean */
+ extra_acked_in_startup:1, /* allowed values: {0, 1} */
+ fast_path:1; /* boolean */
+ u32 full_bw_thresh:10, /* max allowed value: 1023 */
+ startup_cwnd_gain:11, /* max allowed value: 2047 */
+ bw_probe_pif_gain:9, /* max allowed value: 511 */
+ usage_based_cwnd:1, /* boolean */
+ unused2:1;
+ u16 probe_rtt_win_ms:14, /* max allowed value: 16383 */
+ refill_add_inc:2; /* max allowed value: 3 */
+ u16 extra_acked_gain:11, /* max allowed value: 2047 */
+ extra_acked_win_rtts:5; /* max allowed value: 31*/
+ u16 pacing_gain[CYCLE_LEN]; /* max allowed value: 1023 */
+ /* Mostly BBR v2 parameters below here: */
+ u32 ecn_alpha_gain:8, /* max allowed value: 255 */
+ ecn_factor:8, /* max allowed value: 255 */
+ ecn_thresh:8, /* max allowed value: 255 */
+ beta:8; /* max allowed value: 255 */
+ u32 ecn_max_rtt_us:19, /* max allowed value: 524287 */
+ bw_probe_reno_gain:9, /* max allowed value: 511 */
+ full_loss_cnt:4; /* max allowed value: 15 */
+ u32 probe_rtt_cwnd_gain:8, /* max allowed value: 255 */
+ inflight_headroom:8, /* max allowed value: 255 */
+ loss_thresh:8, /* max allowed value: 255 */
+ bw_probe_max_rounds:8; /* max allowed value: 255 */
+ u32 bw_probe_rand_rounds:4, /* max allowed value: 15 */
+ bw_probe_base_us:26, /* usecs: 0..2^26-1 (67 secs) */
+ full_ecn_cnt:2; /* max allowed value: 3 */
+ u32 bw_probe_rand_us:26, /* usecs: 0..2^26-1 (67 secs) */
+ undo:1, /* boolean */
+ tso_rtt_shift:4, /* max allowed value: 15 */
+ unused5:1;
+ u32 ecn_reprobe_gain:9, /* max allowed value: 511 */
+ unused1:14,
+ ecn_alpha_init:9; /* max allowed value: 256 */
+ } params;
+
+ struct {
+ u32 snd_isn; /* Initial sequence number */
+ u32 rs_bw; /* last valid rate sample bw */
+ u32 target_cwnd; /* target cwnd, based on BDP */
+ u8 undo:1, /* Undo event happened but not yet logged */
+ unused:7;
+ char event; /* single-letter event debug codes */
+ u16 unused2;
+ } debug;
+};
+
+struct bbr_context {
+ u32 sample_bw;
+ u32 target_cwnd;
+ u32 log:1;
+};
+
+/* Window length of min_rtt filter (in sec). Max allowed value is 31 (0x1F) */
+static u32 bbr_min_rtt_win_sec = 10;
+/* Minimum time (in ms) spent at bbr_cwnd_min_target in BBR_PROBE_RTT mode.
+ * Max allowed value is 511 (0x1FF).
+ */
+static u32 bbr_probe_rtt_mode_ms = 200;
+/* Window length of probe_rtt_min_us filter (in ms), and consequently the
+ * typical interval between PROBE_RTT mode entries.
+ * Note that bbr_probe_rtt_win_ms must be <= bbr_min_rtt_win_sec * MSEC_PER_SEC
+ */
+static u32 bbr_probe_rtt_win_ms = 5000;
+/* Skip TSO below the following bandwidth (bits/sec): */
+static int bbr_min_tso_rate = 1200000;
+
+/* Use min_rtt to help adapt TSO burst size, with smaller min_rtt resulting
+ * in bigger TSO bursts. By default we cut the RTT-based allowance in half
+ * for every 2^9 usec (aka 512 us) of RTT, so that the RTT-based allowance
+ * is below 1500 bytes after 6 * ~500 usec = 3ms.
+ */
+static u32 bbr_tso_rtt_shift = 9; /* halve allowance per 2^9 usecs, 512us */
+
+/* Select cwnd TSO budget approach:
+ * 0: padding
+ * 1: flooring
+ */
+static uint bbr_cwnd_tso_budget = 1;
+
+/* Pace at ~1% below estimated bw, on average, to reduce queue at bottleneck.
+ * In order to help drive the network toward lower queues and low latency while
+ * maintaining high utilization, the average pacing rate aims to be slightly
+ * lower than the estimated bandwidth. This is an important aspect of the
+ * design.
+ */
+static const int bbr_pacing_margin_percent = 1;
+
+/* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain
+ * that will allow a smoothly increasing pacing rate that will double each RTT
+ * and send the same number of packets per RTT that an un-paced, slow-starting
+ * Reno or CUBIC flow would. Max allowed value is 2047 (0x7FF).
+ */
+static int bbr_high_gain = BBR_UNIT * 2885 / 1000 + 1;
+/* The gain for deriving startup cwnd. Max allowed value is 2047 (0x7FF). */
+static int bbr_startup_cwnd_gain = BBR_UNIT * 2885 / 1000 + 1;
+/* The pacing gain of 1/high_gain in BBR_DRAIN is calculated to typically drain
+ * the queue created in BBR_STARTUP in a single round. Max allowed value
+ * is 1023 (0x3FF).
+ */
+static int bbr_drain_gain = BBR_UNIT * 1000 / 2885;
+/* The gain for deriving steady-state cwnd tolerates delayed/stretched ACKs.
+ * Max allowed value is 2047 (0x7FF).
+ */
+static int bbr_cwnd_gain = BBR_UNIT * 2;
+/* The pacing_gain values for the PROBE_BW gain cycle, to discover/share bw.
+ * Max allowed value for each element is 1023 (0x3FF).
+ */
+enum bbr_pacing_gain_phase {
+ BBR_BW_PROBE_UP = 0, /* push up inflight to probe for bw/vol */
+ BBR_BW_PROBE_DOWN = 1, /* drain excess inflight from the queue */
+ BBR_BW_PROBE_CRUISE = 2, /* use pipe, w/ headroom in queue/pipe */
+ BBR_BW_PROBE_REFILL = 3, /* v2: refill the pipe again to 100% */
+};
+static int bbr_pacing_gain[] = {
+ BBR_UNIT * 5 / 4, /* probe for more available bw */
+ BBR_UNIT * 3 / 4, /* drain queue and/or yield bw to other flows */
+ BBR_UNIT, BBR_UNIT, BBR_UNIT, /* cruise at 1.0*bw to utilize pipe, */
+ BBR_UNIT, BBR_UNIT, BBR_UNIT /* without creating excess queue... */
+};
+
+/* Floor for packets in flight when the connection is otherwise healthy. A
+ * sliding-window sender whose peer ACKs every other packet needs at least
+ * 4 packets outstanding to run smoothly. Max allowed value is 15 (0xF).
+ */
+static u32 bbr_cwnd_min_target = 4;
+
+/* PROBE_RTT cwnd as a proportion of BDP, scaled by BBR_UNIT. Default: 50%.
+ * Use 0 to disable. Max allowed value is 255.
+ */
+static u32 bbr_probe_rtt_cwnd_gain = BBR_UNIT / 2;
+
+/* Estimating whether BBR_STARTUP mode (i.e. high_gain) has filled the pipe: */
+/* A bw increase of at least this factor (1.25x) counts as significant and
+ * hints that more bw may be available. Max allowed value is 1023 (0x3FF).
+ */
+static u32 bbr_full_bw_thresh = BBR_UNIT * 5 / 4;
+/* After this many rounds (3) w/o significant bw growth, estimate that the
+ * pipe is full. Max allowed value is 7 (0x7).
+ */
+static u32 bbr_full_bw_cnt = 3;
+
+static u32 bbr_flags;	/* debug flag bits, e.g. FLAG_DEBUG_VERBOSE */
+
+/* When set, debug output goes out through printk.
+ */
+static bool bbr_debug_with_printk;
+
+/* When set, debug using the ftrace event tcp:tcp_bbr_event.
+ * Has no effect while bbr_debug_with_printk is set.
+ */
+static bool bbr_debug_ftrace;
+
+/* Experiment: each cycle, try to hold sub-unity gain until inflight <= BDP. */
+static bool bbr_drain_to_target = true;	/* on by default */
+
+/* Experiment: flags that control how BBR behaves with ECN.
+ */
+static bool bbr_precise_ece_ack = true;	/* on by default */
+
+/* Window scaling shifts by at most 14 (RFC 1323), so the max sane rwin is
+ * (2^(16+14) B)/(1024 B/packet) = 1M packets; bbr_debug() warns above this.
+ */
+static u32 bbr_cwnd_warn_val = 1U << 20;
+
+static u16 bbr_debug_port_mask;	/* nonzero: match if (dport & mask) == 0 */
+
+/* BBR module parameters. Only Google prod exposes these as module
+ * parameters; upstream intentionally does not.
+ */
+static int bbr_pacing_gain_size = CYCLE_LEN;
+
+/* Gain factor applied to extra_acked when adding it to the target cwnd: */
+static int bbr_extra_acked_gain = 256;
+
+/* Length of the extra_acked window, in round trips. Max allowed val is 31. */
+static u32 bbr_extra_acked_win_rtts = 5;
+
+/* Max allowed ack_epoch_acked; beyond it the sampling epoch is reset. */
+static u32 bbr_ack_epoch_acked_reset_thresh = 1U << 20;
+
+/* Time period (usec) for clamping the cwnd increment from ack aggregation. */
+static u32 bbr_extra_acked_max_us = 100 * USEC_PER_MSEC;
+
+/* Whether to use extra_acked in startup:
+ * 0: disabled
+ * 1: use latest extra_acked value from 1-2 rtt in startup
+ */
+static int bbr_extra_acked_in_startup = 1;	/* on by default */
+
+/* Experiment: don't grow cwnd beyond twice of what we just probed. */
+static bool bbr_usage_based_cwnd;	/* off by default */
+
+/* Lab-testing switch: researchers can turn on BBRv2 ECN support when they
+ * know that every ECN mark their connections experience is a DCTCP/L4S-style
+ * mark rather than an RFC3168 mark.
+ * TODO(ncardwell): Production use of the BBRv2 ECN functionality depends on
+ * negotiation or configuration that is outside the scope of the BBRv2
+ * alpha release.
+ */
+static bool bbr_ecn_enable;	/* off by default */
+
+/* Module parameters: every tunable uses mode 0644 (owner-writable only). */
+module_param_named(min_tso_rate, bbr_min_tso_rate, int, 0644);
+module_param_named(tso_rtt_shift, bbr_tso_rtt_shift, int, 0644);
+module_param_named(high_gain, bbr_high_gain, int, 0644);
+module_param_named(drain_gain, bbr_drain_gain, int, 0644);
+module_param_named(startup_cwnd_gain, bbr_startup_cwnd_gain, int, 0644);
+module_param_named(cwnd_gain, bbr_cwnd_gain, int, 0644);
+module_param_array_named(pacing_gain, bbr_pacing_gain, int,
+			 &bbr_pacing_gain_size, 0644);
+module_param_named(cwnd_min_target, bbr_cwnd_min_target, uint, 0644);
+module_param_named(probe_rtt_cwnd_gain, bbr_probe_rtt_cwnd_gain, uint, 0644);
+module_param_named(cwnd_warn_val, bbr_cwnd_warn_val, uint, 0644);
+module_param_named(debug_port_mask, bbr_debug_port_mask, ushort, 0644);
+module_param_named(flags, bbr_flags, uint, 0644);
+module_param_named(debug_ftrace, bbr_debug_ftrace, bool, 0644);
+module_param_named(debug_with_printk, bbr_debug_with_printk, bool, 0644);
+module_param_named(min_rtt_win_sec, bbr_min_rtt_win_sec, uint, 0644);
+module_param_named(probe_rtt_mode_ms, bbr_probe_rtt_mode_ms, uint, 0644);
+module_param_named(probe_rtt_win_ms, bbr_probe_rtt_win_ms, uint, 0644);
+module_param_named(full_bw_thresh, bbr_full_bw_thresh, uint, 0644);
+module_param_named(full_bw_cnt, bbr_full_bw_cnt, uint, 0644);
+/* "cwnd_tso_bduget" (sic) is kept as an alias so existing users still work. */
+module_param_named(cwnd_tso_budget, bbr_cwnd_tso_budget, uint, 0644);
+module_param_named(cwnd_tso_bduget, bbr_cwnd_tso_budget, uint, 0644);
+module_param_named(extra_acked_gain, bbr_extra_acked_gain, int, 0644);
+module_param_named(extra_acked_win_rtts, bbr_extra_acked_win_rtts, uint, 0644);
+module_param_named(extra_acked_max_us, bbr_extra_acked_max_us, uint, 0644);
+module_param_named(ack_epoch_acked_reset_thresh,
+		   bbr_ack_epoch_acked_reset_thresh, uint, 0644);
+module_param_named(drain_to_target, bbr_drain_to_target, bool, 0644);
+module_param_named(precise_ece_ack, bbr_precise_ece_ack, bool, 0644);
+module_param_named(extra_acked_in_startup,
+		   bbr_extra_acked_in_startup, int, 0644);
+module_param_named(usage_based_cwnd, bbr_usage_based_cwnd, bool, 0644);
+module_param_named(ecn_enable, bbr_ecn_enable, bool, 0644);
+
+/* Forward declarations; static, so defined elsewhere in this translation unit. */
+static void bbr2_exit_probe_rtt(struct sock *sk);
+static void bbr2_reset_congestion_signals(struct sock *sk);
+static void bbr_check_probe_rtt_done(struct sock *sk);
+
+/* Has STARTUP been estimated to have filled the pipe? */
+static bool bbr_full_bw_reached(const struct sock *sk)
+{
+	const struct bbr *ca = inet_csk_ca(sk);
+
+	return ca->full_bw_reached;
+}
+
+/* Windowed max bw: larger of the two bw_hi[] slots, pkts/uS << BW_SCALE. */
+static u32 bbr_max_bw(const struct sock *sk)
+{
+	const struct bbr *ca = inet_csk_ca(sk);
+
+	return max(ca->bw_hi[0], ca->bw_hi[1]);
+}
+
+/* Path bw estimate: windowed max bw capped by bw_lo, pkts/uS << BW_SCALE. */
+static u32 bbr_bw(const struct sock *sk)
+{
+	const struct bbr *ca = inet_csk_ca(sk);
+
+	return min(bbr_max_bw(sk), ca->bw_lo);
+}
+
+/* Largest extra_acked across the two window slots, i.e. over the past
+ * k-2k round trips, where k = bbr_extra_acked_win_rtts.
+ */
+static u16 bbr_extra_acked(const struct sock *sk)
+{
+	const struct bbr *ca = inet_csk_ca(sk);
+
+	return max(ca->extra_acked[0], ca->extra_acked[1]);
+}
+
+/* Scale @rate to bytes per second, applying @gain (fixed point with BBR_SCALE
+ * fractional bits) and holding back @margin percent. The steps are ordered
+ * to avoid overflowing u64; per the original authors this should work for
+ * input rates of up to 2.9Tbit/sec and gain of 2.89x.
+ */
+static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain,
+				  int margin)
+{
+	const unsigned int mss = tcp_sk(sk)->mss_cache;
+	u64 bps = rate * mss;			/* packets -> bytes */
+
+	bps = (bps * gain) >> BBR_SCALE;	/* apply fixed-point gain */
+	bps *= USEC_PER_SEC / 100 * (100 - margin); /* usec -> sec, less margin */
+	bps >>= BW_SCALE;			/* drop the bw fixed-point scale */
+
+	return max(bps, 1ULL);			/* never report a zero rate */
+}
+
+/* Convert a bw value to bytes/sec at unity gain (BBR_UNIT), zero margin. */
+static u64 bbr_bw_bytes_per_sec(struct sock *sk, u64 rate)
+{
+	return bbr_rate_bytes_per_sec(sk, rate, BBR_UNIT, 0);
+}
+
+/* Convert a bw value to kbit/sec, as printed by bbr_debug(). */
+static u64 bbr_rate_kbps(struct sock *sk, u64 rate)
+{
+	/* bytes/sec -> bits/sec -> kbit/sec */
+	return div_u64(bbr_bw_bytes_per_sec(sk, rate) * 8, 1000);
+}
+
+static u32 bbr_tso_segs_goal(struct sock *sk);
+
+/* Warn once on an implausible cwnd, then optionally log one line per call. */
+static void bbr_debug(struct sock *sk, u32 acked,
+		      const struct rate_sample *rs, struct bbr_context *ctx)
+{
+	static const char ca_states[] = {	/* icsk_ca_state -> letter */
+		[TCP_CA_Open] = 'O',
+		[TCP_CA_Disorder] = 'D',
+		[TCP_CA_CWR] = 'C',
+		[TCP_CA_Recovery] = 'R',
+		[TCP_CA_Loss] = 'L',
+	};
+	static const char mode[] = {		/* bbr->mode -> letter */
+		'G', /* Growing - BBR_STARTUP */
+		'D', /* Drain - BBR_DRAIN */
+		'W', /* Window - BBR_PROBE_BW */
+		'M', /* Min RTT - BBR_PROBE_RTT */
+	};
+	static const char ack_phase[] = { /* bbr_ack_phase strings */
+		'I', /* BBR_ACKS_INIT - 'Init' */
+		'R', /* BBR_ACKS_REFILLING - 'Refilling' */
+		'B', /* BBR_ACKS_PROBE_STARTING - 'Before' */
+		'F', /* BBR_ACKS_PROBE_FEEDBACK - 'Feedback' */
+		'A', /* BBR_ACKS_PROBE_STOPPING - 'After' */
+	};
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+	const u32 una = tp->snd_una - bbr->debug.snd_isn;
+	const u32 fack = tcp_highest_sack_seq(tp);
+	const u16 dport = ntohs(inet_sk(sk)->inet_dport);
+	bool is_port_match = (bbr_debug_port_mask &&
+			      ((dport & bbr_debug_port_mask) == 0));
+	char debugmsg[320];
+
+	if (sk->sk_state == TCP_SYN_SENT)
+		return; /* no bbr_init() yet if SYN retransmit -> CA_Loss */
+
+	if (!tp->snd_cwnd || tp->snd_cwnd > bbr_cwnd_warn_val) {
+		char addr[INET6_ADDRSTRLEN + 10] = { 0 };
+
+		if (sk->sk_family == AF_INET)
+			snprintf(addr, sizeof(addr), "%pI4:%u",
+				 &inet_sk(sk)->inet_daddr, dport);
+		else if (sk->sk_family == AF_INET6)
+			snprintf(addr, sizeof(addr), "%pI6:%u",
+				 &sk->sk_v6_daddr, dport);
+
+		WARN_ONCE(1, "BBR %s cwnd alert: %u "
+			  "snd_una: %u ca: %d pacing_gain: %u cwnd_gain: %u "
+			  "bw: %u rtt: %u min_rtt: %u "
+			  "acked: %u tso_segs: %u "
+			  "bw: %d %ld %d pif: %u\n",
+			  addr, tp->snd_cwnd,
+			  una, inet_csk(sk)->icsk_ca_state,
+			  bbr->pacing_gain, bbr->cwnd_gain,
+			  bbr_max_bw(sk), (tp->srtt_us >> 3), bbr->min_rtt_us,
+			  acked, bbr_tso_segs_goal(sk),
+			  rs->delivered, rs->interval_us, rs->is_retrans,
+			  tcp_packets_in_flight(tp));
+	}
+
+	if (likely(!bbr_debug_with_printk && !bbr_debug_ftrace))
+		return;	/* no debug switch is enabled */
+
+	if (!sock_flag(sk, SOCK_DBG) && !is_port_match)
+		return;	/* socket lacks SOCK_DBG and the port mask did not match */
+
+	if (!ctx->log && !tp->app_limited && !(bbr_flags & FLAG_DEBUG_VERBOSE))
+		return;	/* not flagged for logging, not app-limited, not verbose */
+
+	if (ipv4_is_loopback(inet_sk(sk)->inet_daddr) &&
+	    !(bbr_flags & FLAG_DEBUG_LOOPBACK))
+		return;	/* loopback peers require FLAG_DEBUG_LOOPBACK */
+
+	snprintf(debugmsg, sizeof(debugmsg),	/* always NUL-terminates */
+		 "BBR %pI4:%-5u %5u,%03u:%-7u %c "
+		 "%c %2u br %2u cr %2d rtt %5ld d %2d i %5ld mrtt %d %cbw %llu "
+		 "bw %llu lb %llu ib %llu qb %llu "
+		 "a %u if %2u %c %c dl %u l %u al %u # %u t %u %c %c "
+		 "lr %d er %d ea %d bwl %lld il %d ih %d c %d "
+		 "v %d %c %u %c %s\n",
+		 &inet_sk(sk)->inet_daddr, dport,
+		 una / 1000, una % 1000, fack - tp->snd_una,
+		 ca_states[inet_csk(sk)->icsk_ca_state],
+		 bbr->debug.undo ? '@' : mode[bbr->mode],
+		 tp->snd_cwnd,
+		 bbr_extra_acked(sk), /* br (legacy): extra_acked */
+		 rs->tx_in_flight, /* cr (legacy): tx_inflight */
+		 rs->rtt_us,
+		 rs->delivered,
+		 rs->interval_us,
+		 bbr->min_rtt_us,
+		 rs->is_app_limited ? '_' : 'l',
+		 bbr_rate_kbps(sk, ctx->sample_bw), /* lbw: latest sample bw */
+		 bbr_rate_kbps(sk, bbr_max_bw(sk)), /* bw: max bw */
+		 0ULL, /* lb: [obsolete] */
+		 0ULL, /* ib: [obsolete] */
+		 div_u64((u64)sk->sk_pacing_rate * 8, 1000),
+		 acked,
+		 tcp_packets_in_flight(tp),
+		 rs->is_ack_delayed ? 'd' : '.',
+		 bbr->round_start ? '*' : '.',
+		 tp->delivered, tp->lost,
+		 tp->app_limited,
+		 0, /* #: [obsolete] */
+		 ctx->target_cwnd,
+		 tp->reord_seen ? 'r' : '.', /* r: reordering seen? */
+		 ca_states[bbr->prev_ca_state],
+		 (rs->lost + rs->delivered) > 0 ?
+		 (1000 * rs->lost /
+		  (rs->lost + rs->delivered)) : 0, /* lr: loss rate x1000 */
+		 (rs->delivered) > 0 ?
+		 (1000 * rs->delivered_ce /
+		  (rs->delivered)) : 0, /* er: ECN rate x1000 */
+		 1000 * bbr->ecn_alpha >> BBR_SCALE, /* ea: ECN alpha x1000 */
+		 bbr->bw_lo == ~0U ?
+		 -1 : (s64)bbr_rate_kbps(sk, bbr->bw_lo), /* bwl */
+		 bbr->inflight_lo, /* il */
+		 bbr->inflight_hi, /* ih */
+		 bbr->bw_probe_up_cnt, /* c */
+		 2, /* v: version */
+		 bbr->debug.event,
+		 bbr->cycle_idx,
+		 ack_phase[bbr->ack_phase],
+		 bbr->bw_probe_samples ? "Y" : "N");
+
+	/* Only the printk sink is wired up in this function. */
+	if (bbr_debug_with_printk)
+		printk(KERN_DEBUG "%s", debugmsg);
+
+	if (unlikely(bbr->debug.undo))
+		bbr->debug.undo = 0;	/* the '@' undo marker is one-shot */
+}
+
+/* Pacing rate in bytes per second for @bw at @gain: apply the pacing
+ * margin, then cap the result at the socket's sk_max_pacing_rate.
+ */
+static unsigned long bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain)
+{
+	const u64 paced = bbr_rate_bytes_per_sec(sk, bw, gain,
+						 bbr_pacing_margin_percent);
+
+	return min_t(u64, paced, sk->sk_max_pacing_rate);
+}
+
+/* Initialize pacing rate to: high_gain * init_cwnd / RTT. */
+static void bbr_init_pacing_rate_from_rtt(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+ u64 bw;
+ u32 rtt_us;
+
+ if (tp->srtt_us) { /* any RTT sample yet? */